Diffstat (limited to 'src/runtime')
376 files changed, 13785 insertions, 21947 deletions
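Most of the churn in this diff is a mechanical restyle of src/runtime (a space after control keywords, tighter braced initializers, one-per-line constructor initializer lists, wrapped long parameter lists), consistent with a repository-wide clang-format restyle; the genuinely behavioural changes are unpacked in the notes at the end of this page. A minimal toy illustration of the style change (hypothetical code, not taken from the diff):

#include <vector>

// Old style (pre-commit):
//   if(v.empty())
//   {
//       return { 0, 0 };
//   }
// New style (post-commit): space after 'if', no spaces inside braced init.
std::vector<int> first_and_last(const std::vector<int> &v)
{
    if (v.empty())
    {
        return {0, 0};
    }
    return {v.front(), v.back()};
}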
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp index ef7c62d64b..eca712dbf0 100644 --- a/src/runtime/Allocator.cpp +++ b/src/runtime/Allocator.cpp @@ -22,9 +22,9 @@ * SOFTWARE. */ #include "arm_compute/runtime/Allocator.h" -#include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/core/Error.h" +#include "arm_compute/runtime/MemoryRegion.h" #include <cstddef> diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp index 1c983aa329..8a0fc05c39 100644 --- a/src/runtime/BlobLifetimeManager.cpp +++ b/src/runtime/BlobLifetimeManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,12 +30,12 @@ #include <algorithm> #include <cmath> +#include <iterator> #include <map> namespace arm_compute { -BlobLifetimeManager::BlobLifetimeManager() - : _blobs() +BlobLifetimeManager::BlobLifetimeManager() : _blobs() { } @@ -61,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings() ARM_COMPUTE_ERROR_ON(_active_group == nullptr); // Sort free blobs requirements in descending order. - _free_blobs.sort([](const Blob & ba, const Blob & bb) - { - return ba.max_size > bb.max_size; - }); + _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; }); // Create group sizes vector std::vector<BlobInfo> group_sizes; - std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b) - { - return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() }; - }); + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), + [](const Blob &b) { + return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()}; + }); // Update blob sizes size_t max_size = std::max(_blobs.size(), group_sizes.size()); _blobs.resize(max_size); group_sizes.resize(max_size); - std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs) - { - return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) }; - }); + std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), + [](BlobInfo lhs, BlobInfo rhs) + { + return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), + std::max(lhs.owners, rhs.owners)}; + }); // Calculate group mappings auto &group_mappings = _active_group->mappings(); int blob_idx = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp index 88e280537c..a2f63ef52b 100644 --- a/src/runtime/BlobMemoryPool.cpp +++ b/src/runtime/BlobMemoryPool.cpp @@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool() void BlobMemoryPool::acquire(MemoryMappings &handles) { // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(_blobs[handle.second].get()); @@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles) void BlobMemoryPool::release(MemoryMappings &handles) { - for(auto &handle 
: handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info) { ARM_COMPUTE_ERROR_ON(!_allocator); - for(const auto &bi : blob_info) + for (const auto &bi : blob_info) { _blobs.push_back(_allocator->make_region(bi.size, bi.alignment)); } diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp index e06ef3d37d..b4545b93bf 100644 --- a/src/runtime/CL/CLBufferAllocator.cpp +++ b/src/runtime/CL/CLBufferAllocator.cpp @@ -35,7 +35,8 @@ namespace arm_compute void *CLBufferAllocator::allocate(size_t size, size_t alignment) { ARM_COMPUTE_UNUSED(alignment); - cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) }; + cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, + nullptr, nullptr)}; return static_cast<void *>(buf); } diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp index 7168259fcd..d680dc08bb 100644 --- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp +++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp @@ -27,8 +27,7 @@ namespace arm_compute { -CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() - : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) +CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) { } CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default; diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index 5b4bbbcde0..eb28ecbf8d 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void * @return A pointer to the context properties which can be used to create an opencl context */ -void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop) +void initialise_context_properties(const cl::Platform &platform, + const cl::Device &device, + std::array<cl_context_properties, 7> &prop) { ARM_COMPUTE_UNUSED(device); #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // Query devices in the context for cl_arm_printf support - if(arm_compute::device_supports_extension(device, "cl_arm_printf")) + if (arm_compute::device_supports_extension(device, "cl_arm_printf")) { // Create a cl_context with a printf_callback and user specified buffer size. - std::array<cl_context_properties, 7> properties_printf = - { + std::array<cl_context_properties, 7> properties_printf = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), // Enable a printf callback function for this context. CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback), // Request a minimum printf buffer size of 4MB for devices in the // context that support this extension. 
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000, - 0 - }; + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0}; prop = properties_printf; } else #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED) { - std::array<cl_context_properties, 3> properties = - { - CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), - 0 - }; + std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM, + reinterpret_cast<cl_context_properties>(platform()), 0}; std::copy(properties.begin(), properties.end(), prop.begin()); }; } @@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) cl::Platform::get(&platforms); ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform"); - cl::Platform selected_platform{ nullptr }; + cl::Platform selected_platform{nullptr}; // If the user has selected the Native platform, return the first available. - switch(cl_backend_type) + switch (cl_backend_type) { case CLBackendType::Native: selected_platform = platforms[0]; break; case CLBackendType::Clvk: - for(auto p : platforms) + for (auto p : platforms) { std::string res = p.getInfo<CL_PLATFORM_NAME>(); - if(res.find("clvk") != std::string::npos) + if (res.find("clvk") != std::string::npos) { selected_platform = p; break; @@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) ARM_COMPUTE_ERROR("Unsupported backend type"); } - if(!selected_platform()) + if (!selected_platform()) { ARM_COMPUTE_ERROR("No valid platform found"); } @@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) return selected_platform; } -std::tuple<cl::Context, cl::Device, cl_int> -create_opencl_context_and_device(CLBackendType cl_backend_type) +std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type) { ARM_COMPUTE_ERROR_ON(!opencl_is_available()); cl::Platform p = select_preferable_platform(cl_backend_type); @@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) std::vector<cl::Device> platform_devices; p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices); ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device"); - device = platform_devices[0]; - cl_int err = CL_SUCCESS; - std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 }; + device = platform_devices[0]; + cl_int err = CL_SUCCESS; + std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0}; initialise_context_properties(p, device, properties); cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); @@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush) { ARM_COMPUTE_ERROR_ON_NULLPTR(kernel); - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr); ctx->gpu_scheduler()->enqueue(*kernel, flush); diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index a1743c56e6..c6ee6fde83 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -24,24 +24,22 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" + #include "support/Cast.h" namespace arm_compute { -CLMemory::CLMemory() - : _region(nullptr), _region_owned(nullptr) +CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr) { } -CLMemory::CLMemory(const 
std::shared_ptr<ICLMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -CLMemory::CLMemory(ICLMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } @@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region) _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region)); _region = _region_owned.get(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 780a563d63..c9ddf9b85c 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,14 +26,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) - : IMemoryRegion(size), - _queue(CLScheduler::get().queue()), - _ctx(CLScheduler::get().context()), - _mapping(nullptr), - _mem() + : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem() { } @@ -58,21 +56,34 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset, return nullptr; } -CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) - : ICLMemoryRegion(size) +CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size) { - if(_size != 0) + if (_size != 0) { _mem = cl::Buffer(CLScheduler::get().context(), flags, _size); } } -CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) - : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()) +CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()) { _mem = buffer; } +CLBufferMemoryRegion::~CLBufferMemoryRegion() +{ + // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before + // this buffer is freed + // Do not call finish as it is a blocking call which affects the performance + try + { + CLScheduler::get().queue().flush(); + } + catch (const std::exception &e) + { + ARM_COMPUTE_LOG_ERROR_ACL(e.what()); + } +} + void *CLBufferMemoryRegion::ptr() { return nullptr; @@ -95,10 +106,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q) ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment) : ICLMemoryRegion(size), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment); - if(_ptr != nullptr) + if (_ptr != nullptr) { _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr); } @@ -107,15 +118,18 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a ICLSVMMemoryRegion::~ICLSVMMemoryRegion() { - if(_ptr != nullptr) + if (_ptr != nullptr) { try { - clFinish(_queue.get()); + // Can only use the blocking finish instead of the non-blocking flush here, because clSVMFree requires all + // commands that may use the svm pointer to finish beforehand + // https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clSVMFree.html + clFinish(CLScheduler::get().queue().get()); _mem = cl::Buffer(); clSVMFree(_ctx.get(), _ptr); } - catch(...) + catch (...) { } } @@ -134,7 +148,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(_ptr == nullptr); - clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr); + clEnqueueSVMMap(q.get(), blocking ? 
CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, + nullptr); _mapping = _ptr; return _mapping; } @@ -153,7 +168,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { - if(blocking) + if (blocking) { clFinish(q.get()); } diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 075a544077..89d4520038 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -30,14 +30,13 @@ namespace arm_compute { namespace experimental { -ICLOperator::ICLOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void ICLOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 5083b4b0c5..b426b8c304 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -29,7 +30,10 @@ namespace arm_compute { CLRuntimeContext::CLRuntimeContext() - : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type() + : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), + _gpu_scheduler(_gpu_owned_scheduler.get()), + _symbols(), + _backend_type() { _symbols.load_default(); auto ctx_dev_err = create_opencl_context_and_device(_backend_type); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index cb5f04ce8b..f0a42f55fd 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -95,7 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -106,9 +116,12 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); @@ -120,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; @@ -132,8 +145,16 @@ void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_ init(ctx, queue, dev, cl_tuner, gemm_h); } - // Set CL tuner - _cl_tuner = cl_tuner; + // Set CL tuner and GEMM heuristics + _cl_tuner = cl_tuner; + _gemm_heuristics = gemm_h; +} + +void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +{ + _is_initialised = false; + + default_init(cl_tuner, gemm_h, cl_backend_type); } void CLScheduler::set_context(cl::Context context) @@ -142,7 +163,12 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); _queue = std::move(queue); @@ -155,22 +181,49 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! 
Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); const bool inject_memory = !tensors.empty(); // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); + if (_job_chaining_enabled) + { + ++_job_chaining_count; + } - if(flush) + flush_queue(flush); +} + +void CLScheduler::flush_queue(bool flush) +{ + if (_job_chaining_enabled) + { + if (_job_chaining_count >= _job_chaining_size) + { + _job_chaining_count = 0; + /* + Optimisation note: Flush the queue at the first enqueue to start the GPU + execution and then incrementally saturate the clFlush calls to minimize + the CPU activity for job-scheduling. + For eg. job-chain size goes from 1, 2, 4, 8 and 16 + */ + if (_job_chaining_size < 16) + { + _job_chaining_size <<= 1; + } + _queue.flush(); + } + } + else if (flush) { _queue.flush(); } @@ -186,4 +239,10 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush { enqueue_common(kernel, tensors, flush); } + +void CLScheduler::enable_job_chaining(int job_chaining_size) +{ + _job_chaining_enabled = true; + _job_chaining_size = job_chaining_size; +} } // namespace arm_compute diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp index d0822414c3..ace820bbb7 100644 --- a/src/runtime/CL/CLSubTensor.cpp +++ b/src/runtime/CL/CLSubTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,12 +29,14 @@ using namespace arm_compute; -CLSubTensor::CLSubTensor() - : _parent(nullptr), _info() +CLSubTensor::CLSubTensor() : _parent(nullptr), _info() { } -CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent) +CLSubTensor::CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent) : _parent(nullptr), _info() { ARM_COMPUTE_ERROR_ON(parent == nullptr); @@ -81,11 +83,15 @@ void CLSubTensor::unmap() uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? 
CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size())); + if (_parent->buffer() == nullptr) + { + _parent->map(q, blocking); + } + return _parent->buffer(); } void CLSubTensor::do_unmap(cl::CommandQueue &q) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - q.enqueueUnmapMemObject(cl_buffer(), buffer()); + _parent->unmap(q); } diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index f85b8ae777..e6457218c7 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr; std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment) { // Try fine-grain SVM - std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - alignment); + std::unique_ptr<ICLMemoryRegion> region = + std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment); // Try coarse-grain SVM in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment); } // Try legacy buffer memory in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); } @@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset) * @param[in] qinfo Quantization info * @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes */ -void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size) +void populate_quantization_info(CLFloatArray &scale, + CLInt32Array &offset, + const QuantizationInfo &qinfo, + size_t pad_size) { clear_quantization_arrays(scale, offset); @@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type); scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, + qinfo.scale().data()); - if(!qinfo.offset().empty()) + if (!qinfo.offset().empty()) { // Create offset array - const std::vector<int32_t> &qoffset = qinfo.offset(); - const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type); - offset = CLInt32Array(num_elements + pad_size); + const std::vector<int32_t> &qoffset = qinfo.offset(); + const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type); + offset = CLInt32Array(num_elements + pad_size); offset.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext 
CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(static_global_cl_allocator != nullptr) + if (static_global_cl_allocator != nullptr) { _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } @@ -146,7 +150,7 @@ void CLTensorAllocator::allocate() } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator) uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index e16d6808ed..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,10 +22,12 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -36,10 +38,36 @@ namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } +struct CLTuner::IKernelData +{ + virtual ~IKernelData() = default; + virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; +}; +struct DefaultKernelData : public CLTuner::IKernelData +{ + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} + { + } + ~DefaultKernelData() override = default; + void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override + { + const bool inject_memory = !_tensors.empty(); + inject_memory ? 
kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue); + } + +private: + ITensorPack &_tensors; +}; + bool CLTuner::kernel_event_is_set() const { return _kernel_event() != nullptr; @@ -63,11 +91,6 @@ void CLTuner::set_tuner_mode(CLTunerMode mode) _tuning_info.tuner_mode = mode; } -CLTunerMode CLTuner::get_tuner_mode() const -{ - return _tuning_info.tuner_mode; -} - void CLTuner::tune_kernel_static(ICLKernel &kernel) { ARM_COMPUTE_UNUSED(kernel); @@ -79,29 +102,30 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) tune_kernel_dynamic(kernel, pack); } -void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) +void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { auto p = _tuning_params_table.find(config_id); - if(p == _tuning_params_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel - CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, tensors); + CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); // Insert the optimal LWS in the table add_tuning_params(config_id, opt_tuning_params); // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); } @@ -111,17 +135,18 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(p->second.get_wbsm()); } } } } - -void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws) +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { - add_tuning_params(kernel_id, CLTuningParams(optimal_lws)); + DefaultKernelData data{tensors}; + + do_tune_kernel_dynamic(kernel, &data); } void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params) @@ -129,13 +154,13 @@ void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams opt _tuning_params_table.emplace(kernel_id, optimal_tuning_params); } -CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors) +CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data) { // Profiling queue cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -146,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, 
ITensorPac // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -157,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -181,11 +208,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline - const bool inject_memory = !tensors.empty(); - inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); + + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -198,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac // Construct the list of tuning parameters values to be tested based on the tuner mode. 
auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); - for(size_t i = 0; i < tuning_list->size(); ++i) + for (size_t i = 0; i < tuning_list->size(); ++i) { CLTuningParams tuning_test = (*tuning_list)[i]; // Setting the lws @@ -208,20 +243,22 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } kernel.set_lws_hint(lws_test); - if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) { cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel - inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); queue_profiler.finish(); @@ -231,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; opt_tuning_params.set_lws(tuning_test.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); } @@ -247,25 +284,6 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac return opt_tuning_params; } -void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table) -{ - _tuning_params_table.clear(); - for(auto && params : lws_table) - { - add_tuning_params(params.first, CLTuningParams(params.second)); - } -} - -const std::unordered_map<std::string, cl::NDRange> &CLTuner::lws_table() -{ - _lws_table.clear(); - for(auto && params : _tuning_params_table) - { - _lws_table.emplace(params.first, params.second.get_lws()); - } - return _lws_table; -} - const std::unordered_map<std::string, CLTuningParams> &CLTuner::tuning_params_table() const { return _tuning_params_table; @@ -282,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; bool header_line = true; - while(!std::getline(fs, line).fail()) + while (!std::getline(fs, line).fail()) { - if(header_line) + if (header_line) { header_line = false; size_t pos_lws = line.find("lws"); size_t pos_wbsm = line.find("wbsm"); _tuning_info.tune_wbsm = false; - if(pos_lws != std::string::npos || pos_wbsm != std::string::npos) + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { // The file has in the first line the parameters it has been tuned on - if(pos_wbsm != std::string::npos) + if (pos_wbsm != std::string::npos) { _tuning_info.tune_wbsm = true; } // Once the line with the tuning parameter is read we can // read the next one to start collecting the values - if(std::getline(fs, line).fail()) + if (std::getline(fs, line).fail()) { break; } @@ -314,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename) CLTuningParams tuning_params; size_t pos = line.find(";"); 
- if(pos == std::string::npos) + if (pos == std::string::npos) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } std::string kernel_id = line.substr(0, pos); line.erase(0, pos + 1); - if(!tuning_params.from_string(_tuning_info, line)) + if (!tuning_params.from_string(_tuning_info, line)) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } @@ -331,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename) bool CLTuner::save_to_file(const std::string &filename) const { - if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) { return false; } @@ -340,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const fs.open(filename, std::ios::out); std::string header_string = ""; header_string += "lws"; - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { - if(!header_string.empty()) + if (!header_string.empty()) { header_string += " "; } header_string += "wbsm"; } fs << header_string << std::endl; - for(auto const &kernel_data : _tuning_params_table) + for (auto const &kernel_data : _tuning_params_table) { CLTuningParams tun_pams(kernel_data.second); fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index 4530537789..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(std::make_unique<CLFillBorderKernel>()), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx) { } diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index da3d4850bf..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -35,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Program program(context, devices, binaries); program.build(); @@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - 
if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 9c71b2aa7d..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,26 +26,27 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClActivation.h" namespace arm_compute { struct CLActivationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr<opencl::ClActivation> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<opencl::ClActivation> op{nullptr}; }; -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : _impl(std::make_unique<Impl>()) +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { _impl->ctx = ctx; } -CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; CLActivationLayer::~CLActivationLayer() = default; @@ -54,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -65,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return opencl::ClActivation::validate(input, output, act_info); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index 8c32563abb..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,8 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" @@ -37,76 +39,52 @@ namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { 
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); - if(num_of_stages == 1) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, ¬_reshaped_output, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector<TensorInfo> sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input->tensor_shape() }; - - for(unsigned int i = 0; i < num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - sums_vector[i].set_data_type(input->data_type()); - sums_vector[i].set_tensor_shape(shape); - sums_vector[i].set_num_channels(input->num_channels()); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &sums_vector[0], axis, op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); - } + ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, ¬_reshaped_output, axis, op)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(¬_reshaped_output, output)); return Status{}; } @@ -116,58 +94,42 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); - _reduction_axis = axis; - - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); - // Configure reduction operation kernels - _reduction_kernels_vector.reserve(_num_of_stages); + _reduction_axis = axis; - auto add_reduction_kernel = [this, &compile_context, axis, op](const ICLTensor * input, const ICLTensor * prev_output, ICLTensor * output) - { - _reduction_kernels_vector.emplace_back(std::make_unique<CLArgMinMaxLayerKernel>()); - _reduction_kernels_vector.back()->configure(compile_context, input, prev_output, output, axis, op); - }; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ? 
DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; + not_reshaped_output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>(); + _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); _memory_group.manage(&_not_reshaped_output); - // Create temporary tensors - if(_num_of_stages == 1) - { - add_reduction_kernel(input, nullptr, &_not_reshaped_output); - } - else - { - _results_vector.resize(_num_of_stages - 1); - TensorShape shape{ input->info()->tensor_shape() }; - for(unsigned int i = 0; i < _num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); - } - - // Apply ReductionOperation only on first kernel - _memory_group.manage(&_results_vector[0]); - add_reduction_kernel(input, nullptr, &_results_vector[0]); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - add_reduction_kernel(input, &_results_vector[i - 1], &_results_vector[i]); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - add_reduction_kernel(input, &_results_vector[last_stage - 1], &_not_reshaped_output); - _results_vector[last_stage - 1].allocator()->allocate(); - } + _reshape.configure(compile_context, &_not_reshaped_output, output); _not_reshaped_output.allocator()->allocate(); } @@ -176,10 +138,7 @@ void CLArgMinMaxLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false); - } + CLScheduler::get().enqueue(*_arg_min_max_kernel, false); _reshape.run(); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index 6b76da81c6..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" namespace arm_compute @@ -41,23 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } @@ -66,4 +84,4 @@ void CLBatchNormalizationLayer::run() { CLScheduler::get().enqueue(*_norm_kernel, true); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index c2fdb74777..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,12 +30,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" namespace arm_compute { -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) { } @@ -43,32 +43,49 @@ CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { - _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const 
ITensorInfo *output, + const CropInfo &crop_info) { - return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } void CLBatchToSpaceLayer::run() diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index 70e27c0cca..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -34,10 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 7970a1698b..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -36,8 +37,9 @@ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index fbda9ad289..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -34,10 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index 4f4b74c04c..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -34,10 +35,14 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input1, input2, output, BitwiseOperation::XOR); _kernel = std::move(k); } -} // namespace arm_compute
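The four bitwise functions above (NOT, AND, OR, XOR) now share a single CLBitwiseKernel selected by a BitwiseOperation enum, with NOT passing nullptr for the unused second input. A condensed sketch of that dispatch, based only on the signatures shown in these hunks (the helper name is hypothetical):

    // Hypothetical helper condensing the common configure body of the CLBitwise* functions,
    // which derive from ICLSimpleFunction and store the configured kernel in _kernel.
    void configure_bitwise(BitwiseOperation op, const ICLTensor *in1, const ICLTensor *in2, ICLTensor *out)
    {
        auto k = std::make_unique<CLBitwiseKernel>();
        // NOT is unary: the kernel receives nullptr as its second input.
        const ICLTensor *second = (op == BitwiseOperation::NOT) ? nullptr : in2;
        k->configure(CLKernelLibrary::get().get_compile_context(), in1, second, out, op);
        // The owning function stores the kernel and enqueues it on run().
    }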
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 0dade0a369..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,24 +23,37 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); + // Configure Bounding Box kernel auto k = std::make_unique<CLBoundingBoxTransformKernel>(); k->configure(compile_context, boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 53256ebed4..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -26,8 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClCast.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> @@ -35,16 +37,15 @@ namespace arm_compute { struct CLCast::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCast> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; }; -CLCast::CLCast() - : _impl(std::make_unique<Impl>()) +CLCast::CLCast() : _impl(std::make_unique<Impl>()) { } -CLCast::CLCast(CLCast &&) = default; +CLCast::CLCast(CLCast &&) = default; CLCast &CLCast::operator=(CLCast &&) = default; CLCast::~CLCast() = default; @@ -53,9 +54,13 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void 
CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, policy); _impl->src = input; _impl->dst = output; @@ -71,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void CLCast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index c6af5a05d5..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute @@ -33,8 +35,12 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique<CLChannelShuffleLayerKernel>(); k->configure(compile_context, input, output, num_groups); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 4122928578..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,8 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLComparisonKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" @@ -35,24 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparison::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation); auto k = std::make_unique<CLComparisonKernel>(); k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ?
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } -Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { return CLComparisonKernel::validate(input1, input2, output, operation); } @@ -64,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC } template <ComparisonOperation COP> -void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { auto k = std::make_unique<CLComparisonKernel>(); k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } template <ComparisonOperation COP> -Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status +CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { return CLComparisonKernel::validate(input1, input2, output, COP); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index ea96e45bf8..9df1c34593 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -24,22 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClConcatenate.h" +#include "src/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { struct CLConcatenateLayer::Impl { std::vector<const ICLTensor *> srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr<opencl::ClConcatenate> op{ nullptr }; + ICLTensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr<opencl::ClConcatenate> op{nullptr}; }; -CLConcatenateLayer::CLConcatenateLayer() - : _impl(std::make_unique<Impl>()) +CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>()) { } @@ -54,9 +55,13 @@ void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor 
*output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector<const ICLTensor *> &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); + ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); _impl->srcs = inputs_vector; _impl->dst = output; @@ -65,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op = std::make_unique<opencl::ClConcatenate>(); std::vector<ITensorInfo *> inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -73,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { return opencl::ClConcatenate::validate(inputs_vector, output, axis); } @@ -81,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu void CLConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp new file mode 100644 index 0000000000..9d1b368f72 --- /dev/null +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLConv3D.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/gpu/cl/operators/ClDirectConv3d.h" + +namespace arm_compute +{ +using namespace arm_compute::experimental; + +struct CLConv3D::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv3d> op{nullptr}; +}; + +CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>()) +{ +} + +CLConv3D::~CLConv3D() = default; + +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + + _impl->src = src; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = dst; + + _impl->op = std::make_unique<opencl::ClDirectConv3d>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); +} + +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) +{ + return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index 8189eee402..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,41 +27,50 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" namespace arm_compute { struct CLConvertFullyConnectedWeights::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr}; }; -CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() - : _impl(std::make_unique<Impl>()) +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) { } CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - 
DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); _impl->src = input; _impl->dst = output; _impl->op = std::make_unique<opencl::ClConvertFullyConnectedWeights>(); _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } @@ -74,4 +83,4 @@ void CLConvertFullyConnectedWeights::run() _impl->op->run(pack); } -} // namespace arm_compute
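CLConv3D, added above, follows the same operator-backed pattern as CLCast and CLConvertFullyConnectedWeights: the function owns an opencl::ClDirectConv3d and its run() forwards the tensors through an ITensorPack. A minimal caller-side sketch; tensor setup is elided, and the default-constructed Conv3dInfo (its exact field list is an assumption here) stands in for real strides, padding and activation:

    CLTensor src, weights, biases, dst;
    // ... init NDHWC tensor infos and allocate ...

    Conv3dInfo conv3d_info{}; // illustrative: set stride/padding/act_info as the model requires

    CLConv3D conv3d;
    conv3d.configure(&src, &weights, &biases, &dst, conv3d_info);
    conv3d.run(); // packs {ACL_SRC_0..2, ACL_DST} and runs ClDirectConv3d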
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 96d7cc72c2..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,115 +23,149 @@ */ #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" -#include <cmath> -#include <memory> -#include <tuple> +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClConv2d.h" +#include "support/Cast.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; - -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() +using namespace arm_compute::experimental; +struct CLConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } CLConvolutionLayer::~CLConvolutionLayer() = default; -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const 
ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); + + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, - weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math)) + switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, + weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: - { - ARM_COMPUTE_ERROR_ON(num_groups != 1); - auto f = std::make_unique<CLWinogradConvolutionLayer>(_memory_manager); - f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math); - _function = std::move(f); - break; - } case ConvolutionMethod::DIRECT: - { - ARM_COMPUTE_ERROR_ON(num_groups != 1); - auto f = std::make_unique<CLDirectConvolutionLayer>(); - f->configure(compile_context, input, weights, biases, output, conv_info, act_info); - _function = std::move(f); - break; - } + case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - auto f = std::make_unique<CLGEMMConvolutionLayer>(_memory_manager); - f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); - _function = std::move(f); + auto f = std::make_unique<opencl::ClConv2d>(); + f->configure(compile_context, input->info(), weights->info(), + ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); + _impl->op = std::move(f); break; } case ConvolutionMethod::FFT: { - auto f = std::make_unique<CLFFTConvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager); f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math); - _function = std::move(f); + _impl->func = std::move(f); break; } default: ARM_COMPUTE_ERROR("Not supported."); break; } + + if (_impl->op) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + } } -Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CLConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); - const GPUTarget gpu_target = CLScheduler::get().target(); + const GPUTarget gpu_target = CLScheduler::get().target(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math)) + switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: - { - //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); - break; - } case ConvolutionMethod::DIRECT: - { - // Validate direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); - break; - } + case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - // Validate gemm-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, 
num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); break; } case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, + act_info, enable_fast_math)); break; } default: @@ -142,120 +176,48 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) +ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo &act_info, + const GPUTarget gpu_target, + const Size2D &dilation, + bool enable_fast_math) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_UNUSED(weights_info); - ARM_COMPUTE_UNUSED(gpu_target); - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - /* Input spatial dims, kernel size, IFM/OFM, conv info*/ - using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; - using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - - const std::vector<ConfigurationMethod> known_configs = - { - // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), - // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), - }; - - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const 
PadStrideInfo info = std::get<3>(config); - const DataLayout data_layout = std::get<4>(config); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); + return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); +} - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == input->data_layout()); - }; +void CLConvolutionLayer::run() +{ + prepare(); - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(dilation != Size2D(1U, 1U)) + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->run(); } else { - if(input->data_layout() == DataLayout::NCHW) - { - // SRGAN - if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 5) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; - } - else - { - // SRGAN - if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if(gpu_target == GPUTarget::G71) - { - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) >= output->dimension(idx_c)) - && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) - { - return ConvolutionMethod::FFT; - } - } - else if(is_data_type_float(input->data_type())) - { - if((weights->dimension(idx_h) >= 5) && (input->dimension(idx_c) >= output->dimension(idx_c)) && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? 
ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; - } + _impl->op->run(_impl->run_pack); } } -void CLConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - void CLConvolutionLayer::prepare() { - _function->prepare(); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 98916bf38a..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -27,8 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClCopy.h" +#include "src/gpu/cl/operators/ClCopy.h" #include <utility> @@ -36,16 +38,15 @@ namespace arm_compute { struct CLCopy::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCopy> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCopy> op{nullptr}; }; -CLCopy::CLCopy() - : _impl(std::make_unique<Impl>()) +CLCopy::CLCopy() : _impl(std::make_unique<Impl>()) { } -CLCopy::CLCopy(CLCopy &&) = default; +CLCopy::CLCopy(CLCopy &&) = default; CLCopy &CLCopy::operator=(CLCopy &&) = default; CLCopy::~CLCopy() = default; @@ -57,6 +58,7 @@ void CLCopy::configure(ICLTensor *input, ICLTensor *output, Window *dst_window) void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, dst_window); _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp index 20cab4df5f..fc29c43827 100644 --- a/src/runtime/CL/functions/CLCrop.cpp +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -27,8 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClCrop.h" +#include "src/gpu/cl/operators/ClCrop.h" #include <utility> @@ -36,38 +38,57 @@ namespace arm_compute { struct CLCrop::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCrop> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCrop> op{nullptr}; }; -CLCrop::CLCrop() - : _impl(std::make_unique<Impl>()) +CLCrop::CLCrop() : _impl(std::make_unique<Impl>()) { } -CLCrop::CLCrop(CLCrop &&) = default; +CLCrop::CLCrop(CLCrop &&) = default; CLCrop &CLCrop::operator=(CLCrop &&) = default; CLCrop::~CLCrop() = default; -void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window); + configure(CLKernelLibrary::get().get_compile_context(), 
src, dst, start, end, batch_index, extrapolation_value, + dst_window); } -void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); _impl->src = src; _impl->dst = dst; _impl->op = std::make_unique<opencl::ClCrop>(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); } -Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 77c44d539b..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,6 +25,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -35,7 +37,14 @@ namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -48,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. 
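    // Worked instance of that rounding (values illustrative): for a normalised
    // x0 = 0.25 on an input of width 224, the start column is
    // floor(0.25f * (224 - 1) + 0.5f) = floor(56.25f) = 56.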
start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, + static_cast<uint32_t>(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } CLCropResize::~CLCropResize() = default; -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -81,19 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen return Status{}; } -void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { - configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, + extrapolation_value); } -void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - 
InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); - ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); + ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); - TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + TensorShape output_shape = + TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -119,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // kernels used for cropping and scaling. _boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = std::make_unique<CLTensor>(); TensorInfo crop_result_info(1, DataType::F32); @@ -140,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = std::make_unique<CLScale>(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT }); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); @@ -156,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> rows_out_of_bounds{ 0 }; + std::array<int32_t, 2> rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array<int32_t, 2> cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? 
std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -200,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
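Taken together, the out-of-bounds counts drive a fixed fill/copy decomposition of each crop result. A minimal sketch of that control flow, with purely illustrative names (rows_oob/cols_oob stand for the arrays computed above; in the real code each branch configures a CLFill or CLCrop kernel over a Window slice, where window dimension 1 indexes columns and dimension 2 indexes rows):

    // Sketch only: each branch stands for one CLFill/CLCrop kernel configured in the hunk below.
    void fill_and_copy_sketch(const int rows_oob[2], const int cols_oob[2], int out_w, int out_h)
    {
        if (rows_oob[0] > 0) { /* fill rows [0, rows_oob[0]) with extrapolation_value */ }
        const int rows_in = out_h - rows_oob[0] - rows_oob[1];
        if (rows_in > 0)
        {
            if (cols_oob[0] > 0) { /* fill cols [0, cols_oob[0]) of the in-bounds rows */ }
            if (cols_oob[1] > 0) { /* fill cols [out_w - cols_oob[1], out_w) */ }
            const int cols_in = out_w - cols_oob[0] - cols_oob[1];
            if (cols_in > 0)     { /* crop the in-bounds region from the input tensor */ }
        }
        if (rows_oob[1] > 0) { /* fill rows [out_h - rows_oob[1], out_h) */ }
    }

The hunk below implements exactly this sequence, one internal function per region.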
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? 
start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; auto kernel = std::make_unique<CLCrop>(); - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } @@ -274,21 +357,21 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_functions.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { kernel->run(); } CLScheduler::get().sync(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 918848745e..4e0d1501ba 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,18 @@ */ #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/operators/ClTransposedConvolution.h" + #include <cmath> #include <memory> #include <tuple> @@ -36,26 +42,62 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; +struct CLDeconvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; +}; + +CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; + CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function(), _impl(std::make_unique<Impl>()) { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { + auto op = std::make_unique<opencl::ClTransposedConvolution>(); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = bias; + _impl->dst = output; + + _impl->op = std::move(op); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { auto f = std::make_unique<CLDirectDeconvolutionLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info); _function = std::move(f); @@ -74,16 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { + // Validate transposed convolution operator + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -100,24 +154,40 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { - return DeconvolutionMethod::DIRECT; + return DeconvolutionMethod::UPSCALE_CONV2D; } const DataLayout data_layout = input->data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second) { - return DeconvolutionMethod::DIRECT; + // We observe better performance for FP32 types only when ofm <= 16, and for FP16 only when ofm <= 32. 
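Condensed into one place, the selection logic around this heuristic reads as a small decision tree. The following is a sketch for orientation only, assuming arm_compute headers; select, stride_matches_kernel, and small_ofm are illustrative names, not library API:

    // Sketch of get_deconvolution_method's decision tree (names illustrative).
    DeconvolutionMethod select(DataLayout layout, DataType dt, size_t ofm,
                               bool per_channel_quantized, bool stride_matches_kernel)
    {
        if (per_channel_quantized)
        {
            return DeconvolutionMethod::UPSCALE_CONV2D; // per-channel quantized weights always upscale
        }
        if (!stride_matches_kernel) // kernel dims differ from the deconvolution stride
        {
            // DIRECT (ClTransposedConvolution) wins only for small output feature maps:
            // FP32 up to 16 OFMs, FP16 up to 32; other data types are not excluded.
            const bool small_ofm = !(dt == DataType::F32 && ofm > 16) &&
                                   !(dt == DataType::F16 && ofm > 32);
            return (layout == DataLayout::NHWC && small_ofm) ? DeconvolutionMethod::DIRECT
                                                             : DeconvolutionMethod::UPSCALE_CONV2D;
        }
        return DeconvolutionMethod::GEMM; // stride-matched kernels go through GEMM deconvolution
    }

The hunk continues below with the same NHWC and ofm thresholds expressed inline.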
+ if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)) && + !((input->data_type() == DataType::F16) && (ofm > 32))) + { + return DeconvolutionMethod::DIRECT; + } + else + { + return DeconvolutionMethod::UPSCALE_CONV2D; + } } return DeconvolutionMethod::GEMM; @@ -126,10 +196,29 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor void CLDeconvolutionLayer::run() { prepare(); - _function->run(); + + if (_impl->op != nullptr) + { + // Optimized Operator will be used + ITensorPack pack; + + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); + } + else + { + _function->run(); + } } void CLDeconvolutionLayer::prepare() { - _function->prepare(); + if (_impl->op == nullptr) + { + _function->prepare(); + } } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index c371558f30..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,20 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), - _fill(), - _output(nullptr) + : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr) { } CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -50,12 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); _upsample->configure(compile_context, input, _output, info); } diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 6aa370b23c..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -26,8 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include 
"arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClCast.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> @@ -35,16 +37,15 @@ namespace arm_compute { struct CLDepthConvertLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCast> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; }; -CLDepthConvertLayer::CLDepthConvertLayer() - : _impl(std::make_unique<Impl>()) +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>()) { } -CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; CLDepthConvertLayer::~CLDepthConvertLayer() = default; @@ -53,9 +54,14 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) { ARM_COMPUTE_UNUSED(shift); + ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); _impl->src = input; _impl->dst = output; @@ -67,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return opencl::ClCast::validate(input, output, policy); @@ -75,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void CLDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index bd2303c410..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include <utility> @@ -34,8 +35,12 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); auto k = std::make_unique<CLDepthToSpaceLayerKernel>(); k->configure(compile_context, input, output, block_shape); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 6467caffef..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,86 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_dwc; -namespace -{ -Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) -{ - // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - const bool needs_permute = is_nhwc && (depth_multiplier > 1); - - ARM_COMPUTE_RETURN_ERROR_ON(is_quantized && is_nhwc && !needs_permute); - - TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) - { - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - - 
const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); - output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c))); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - } - - if(needs_permute) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - - permute(permuted_input_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_output_shape, PermutationVector(1U, 2U, 0U)); - - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW); - - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - else if(is_nhwc) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, - dilation)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - return Status{}; -} -} // namespace - -CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) +CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()), _permute_input_to_nhwc(), @@ -126,25 +63,34 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default; -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, + act_info, dilation); } -void 
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, - ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, + output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); _is_quantized = is_data_type_quantized(input->info()->data_type()); _is_prepared = false; @@ -153,10 +99,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( _output = output; _needs_permute = input->info()->data_layout() == DataLayout::NCHW; + const GPUTarget gpu_target = CLScheduler::get().target(); + ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -179,10 +127,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( CLTensor *output_multipliers_to_use = nullptr; CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) + if (_is_quantized) { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + const size_t idx_c = + get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = + (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); @@ -191,15 +141,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( output_shifts_to_use = &_output_shifts; } - DWCWeightsKernelInfo dwc_weights_info; - dwc_weights_info.n0 = (depth_multiplier == 1) ? 
8 : 1; - DWCKernelInfo dwc_info; - dwc_info.activation_info = act_info; + // Get the depthwise convolution compute parameters + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); + + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; + + _dwc_native_kernel->set_target(gpu_target); _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - output_multipliers_to_use, output_shifts_to_use); + dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, + output_shifts_to_use); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); @@ -209,37 +163,51 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( _permuted_output.allocator()->allocate(); } - if(_is_quantized) + if (_is_quantized) { _output_multipliers.allocator()->allocate(); _output_shifts.allocator()->allocate(); } } -Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); + + const bool in_place = input == output || output == nullptr; + if (in_place) + { + output = input; + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > + input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > + input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + + const GPUTarget gpu_target = CLScheduler::get().target(); - DWCWeightsKernelInfo dwc_weights_info; - dwc_weights_info.n0 = (depth_multiplier == 1) ? 
8 : 1; - DWCKernelInfo dwc_info; - dwc_info.activation_info = act_info; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; const bool needs_permute = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized(input->data_type()); TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) + if (is_quantized) { - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); @@ -252,73 +220,95 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate } } - if(needs_permute) + if (needs_permute) { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation}; + TensorShape permuted_output_shape = + shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_input = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_weights = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_output = output->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NHWC); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info, - dwc_info, conv_info, depth_multiplier, dilation, - &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + + // Get the depthwise convolution compute parameters + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); + + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + &permuted_input, 
&permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + // Get the depthwise convolution compute parameters + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input, weights, conv_info, dilation, depth_multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, + &output_multipliers_shifts_info)); } return Status{}; } -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run() +void CLDepthwiseConvolutionLayer::run() { prepare(); MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_permute) + if (_needs_permute) { _permute_input_to_nhwc.run(); } CLScheduler::get().enqueue(*_dwc_native_kernel); - if(_needs_permute) + if (_needs_permute) { _permute_output_to_nchw.run(); } } -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() +void CLDepthwiseConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_is_quantized) + if (_is_quantized) { _output_multipliers.map(); _output_shifts.map(); - const unsigned int idx_ofms = _needs_permute ? 2 : 0; - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output->info(), - idx_ofms, - reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0)))); + quantization::compute_quantized_multipliers_and_shifts( + _input->info(), _original_weights->info(), _output != nullptr ? 
_output->info() : _input->info(), + reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); _output_shifts.unmap(); } - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -329,305 +319,4 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() _is_prepared = true; } } - -CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _kernel_nchw(nullptr), - _kernel_nhwc(nullptr), - _border_handler(std::make_unique<CLFillBorderKernel>()), - _permute_input_to_nchw(), - _permute_weights_to_nchw(), - _permute_output_to_nhwc(), - _permuted_input(), - _permuted_weights(), - _permuted_output(), - _output_multipliers(), - _output_shifts(), - _original_weights(nullptr), - _input(nullptr), - _output(nullptr), - _needs_permute(false), - _is_prepared(false), - _is_quantized(false), - _is_nhwc(false) -{ -} - -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); -} - -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, - ICLTensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); - - _is_nhwc = input->info()->data_layout() == DataLayout::NHWC; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _needs_permute = _is_nhwc && (depth_multiplier > 1); - - _is_prepared = false; - _original_weights = weights; - _input = input; - _output = output; - - ICLTensor *input_to_use = input; - const ICLTensor *weights_to_use = weights; - ICLTensor *output_to_use = output; - - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); - - if(_needs_permute) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); - - input_to_use = &_permuted_input; - weights_to_use = &_permuted_weights; - output_to_use = &_permuted_output; - - _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>(); - } - else if(_is_nhwc) - { - _kernel_nhwc = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>(); - } - else - { - _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>(); - } - - CLTensor *output_multipliers_to_use = nullptr; - CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) - { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_quantized_per_channel) ? 
weights->info()->dimension(idx_c) : 1; - - _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - output_multipliers_to_use = &_output_multipliers; - output_shifts_to_use = &_output_shifts; - } - - // Configure kernel - if(_is_nhwc && !_needs_permute) - { - _kernel_nhwc->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, - act_info, dilation); - } - else - { - _kernel_nchw->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, - act_info, dilation, output_multipliers_to_use, output_shifts_to_use); - } - - if(_is_quantized) - { - _output_multipliers.allocator()->allocate(); - _output_shifts.allocator()->allocate(); - } - - // Permute output if needed - if(_needs_permute) - { - // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U)); - - // Allocate tensors - _permuted_input.allocator()->allocate(); - _permuted_output.allocator()->allocate(); - } - // Configure border handler - PixelValue &&zero_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type())) - { - zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset)); - } - if(!_is_nhwc || _needs_permute) - { - _border_handler->configure(compile_context, input_to_use, _kernel_nchw->border_size(), BorderMode::CONSTANT, zero_value); - } -} - -Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) -{ - return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); -} - -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_needs_permute) - { - _permute_input_to_nchw.run(); - } - CLScheduler::get().enqueue(*_border_handler); - if(_is_nhwc && !_needs_permute) - { - CLScheduler::get().enqueue(*_kernel_nhwc); - } - else - { - CLScheduler::get().enqueue(*_kernel_nchw); - } - - if(_needs_permute) - { - _permute_output_to_nhwc.run(); - } -} - -void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare() -{ - if(!_is_prepared) - { - if(_is_quantized) - { - _output_multipliers.map(); - _output_shifts.map(); - const unsigned int idx_ofms = _is_nhwc ? 
0 : 2; - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output->info(), - idx_ofms, - reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0)))); - _output_multipliers.unmap(); - _output_shifts.unmap(); - } - - if(_needs_permute) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _permuted_weights.allocator()->allocate(); - _permute_weights_to_nchw.run(); - _original_weights->mark_as_unused(); - } - - _is_prepared = true; - } -} - -CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic() -{ -} - -void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); -} - -void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) -{ - _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, - dilation); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_3x3.set_memory_group(_memory_manager); - _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - break; - case DepthwiseConvolutionFunction::GENERIC: - { - _func_generic.set_memory_group(_memory_manager); - _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - case DepthwiseConvolutionFunction::GENERIC: - return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo 
act_info, const Size2D &dilation) -{ - if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CLDepthwiseConvolutionLayer::run() -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_3x3.run(); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CLDepthwiseConvolutionLayer::prepare() -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_3x3.prepare(); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 3b104017e7..20162a03db 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -26,20 +26,21 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClDequantize.h" +#include "src/gpu/cl/operators/ClDequantize.h" namespace arm_compute { struct CLDequantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDequantize> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDequantize> op{nullptr}; }; -CLDequantizationLayer::CLDequantizationLayer() - : _impl(std::make_unique<Impl>()) +CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>()) { } CLDequantizationLayer::~CLDequantizationLayer() = default; @@ -49,8 +50,11 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 907e69d8d7..d6dae0d732 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -28,37 +28,49 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/runtime/gpu/cl/operators/ClActivation.h" -#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDirectConv2d> op{ 
nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv2d> op{nullptr}; }; -CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _impl(std::make_unique<Impl>()) +CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>()) { } -CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; +CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; -void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); _impl->src = input; _impl->weights = weights; @@ -66,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->dst = output; _impl->op = std::make_unique<opencl::ClDirectConv2d>(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -84,4 +101,4 @@ void CLDirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -}
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 8d1a91e420..7cd268ab0b 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,12 +26,13 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include <memory> @@ -54,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa { } -Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -66,23 +72,25 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - if(input->data_type() != weights->data_type()) + if (input->data_type() != weights->data_type()) { - ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || + !is_data_type_quantized_asymmetric(input->data_type())); } - if(bias != nullptr) + if (bias != nullptr) { - 
if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -101,26 +109,42 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen unsigned int deconv_pad_y = 0; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } -void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); } -void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info); const unsigned int pad_left = info.pad_left(); const unsigned int pad_right = info.pad_right(); @@ -137,17 +161,21 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, /* use_inverted_axis */ false); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); + auto out_dims = + deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), 
weights->info()->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); _is_prepared = weights_info.retain_internal_weights(); @@ -156,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape unsigned int deconv_pad_x = 0; unsigned int deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0; unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0; @@ -177,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _scaled_output.allocator()->init(scale_out_info); // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, + deconv_pad_bottom, DimensionRoundingType::FLOOR); _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output @@ -189,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _flip_axis.allocator()->allocate(); _flip_axis.map(true); auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); - if(weights->info()->data_layout() == DataLayout::NHWC) + if (weights->info()->data_layout() == DataLayout::NHWC) { axis_data[0] = 1; axis_data[1] = 2; @@ -214,7 +244,7 @@ void CLDirectDeconvolutionLayer::run() void CLDirectDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -227,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare() _conv_f.prepare(); // Free flipped weights - if(!_weights_flipped.is_used()) + if (!_weights_flipped.is_used()) { _weights_flipped.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 60c699cbb8..d9529f0b7f 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -26,36 +26,40 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" -#include 
"src/runtime/gpu/cl/operators/ClAdd.h" -#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/runtime/gpu/cl/operators/ClSub.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClAdd.h" +#include "src/gpu/cl/operators/ClElementwiseOperations.h" +#include "src/gpu/cl/operators/ClSub.h" namespace arm_compute { struct CLArithmeticAddition::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClAdd> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClAdd> op{nullptr}; }; -CLArithmeticAddition::CLArithmeticAddition() - : _impl(std::make_unique<Impl>()) +CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>()) { } -CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; +CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default; CLArithmeticAddition::~CLArithmeticAddition() = default; -void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticAddition::configure( + ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClAdd::validate(input1, input2, output, policy, act_info); } @@ -82,26 +90,33 @@ void CLArithmeticAddition::run() struct CLArithmeticSubtraction::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClSub> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClSub> op{nullptr}; }; -CLArithmeticSubtraction::CLArithmeticSubtraction() - : _impl(std::make_unique<Impl>()) +CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>()) { } -CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; +CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction::~CLArithmeticSubtraction() = default; -void 
CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticSubtraction::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClSub::validate(input1, input2, output, policy, act_info); } @@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run() struct CLArithmeticDivision::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr}; }; -CLArithmeticDivision::CLArithmeticDivision() - : _impl(std::make_unique<Impl>()) +CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>()) { } -CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; +CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default; CLArithmeticDivision::~CLArithmeticDivision() = default; -void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo 
&act_info) +Status CLArithmeticDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info); } @@ -173,26 +201,32 @@ void CLArithmeticDivision::run() struct CLElementwiseMax::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMax> op{nullptr}; }; -CLElementwiseMax::CLElementwiseMax() - : _impl(std::make_unique<Impl>()) +CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>()) { } -CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; +CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default; CLElementwiseMax::~CLElementwiseMax() = default; -void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMax::validate(input1, input2, output, act_info); } @@ -218,26 +255,32 @@ void CLElementwiseMax::run() struct CLElementwiseMin::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMin> op{nullptr}; }; -CLElementwiseMin::CLElementwiseMin() - : _impl(std::make_unique<Impl>()) +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>()) { } -CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; CLElementwiseMin::~CLElementwiseMin() = default; -void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } @@ -263,26 +309,32 @@ void CLElementwiseMin::run() struct CLElementwiseSquaredDiff::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr}; }; -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _impl(std::make_unique<Impl>()) +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>()) { } -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); } @@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run() struct CLElementwisePower::Impl { - const ICLTensor 
*src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwisePower> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwisePower> op{nullptr}; }; -CLElementwisePower::CLElementwisePower() - : _impl(std::make_unique<Impl>()) +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>()) { } -CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp index a45dd6f9a6..3043c26feb 100644 --- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -25,24 +25,24 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h" +#include "src/gpu/cl/operators/ClElementwiseUnary.h" namespace arm_compute { struct CLRsqrtLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClRsqrt> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClRsqrt> op{nullptr}; }; -CLRsqrtLayer::CLRsqrtLayer() - : _impl(std::make_unique<Impl>()) +CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>()) { } -CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; +CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default; CLRsqrtLayer::~CLRsqrtLayer() = default; @@ -74,17 +74,16 @@ void CLRsqrtLayer::run() struct CLExpLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClExp> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClExp> 
op{nullptr}; }; -CLExpLayer::CLExpLayer() - : _impl(std::make_unique<Impl>()) +CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>()) { } -CLExpLayer::CLExpLayer(CLExpLayer &&) = default; +CLExpLayer::CLExpLayer(CLExpLayer &&) = default; CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default; CLExpLayer::~CLExpLayer() = default; @@ -116,17 +115,16 @@ void CLExpLayer::run() struct CLNegLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClNeg> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClNeg> op{nullptr}; }; -CLNegLayer::CLNegLayer() - : _impl(std::make_unique<Impl>()) +CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>()) { } -CLNegLayer::CLNegLayer(CLNegLayer &&) = default; +CLNegLayer::CLNegLayer(CLNegLayer &&) = default; CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default; CLNegLayer::~CLNegLayer() = default; @@ -157,17 +155,16 @@ void CLNegLayer::run() struct CLSinLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClSin> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClSin> op{nullptr}; }; -CLSinLayer::CLSinLayer() - : _impl(std::make_unique<Impl>()) +CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>()) { } -CLSinLayer::CLSinLayer(CLSinLayer &&) = default; +CLSinLayer::CLSinLayer(CLSinLayer &&) = default; CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default; CLSinLayer::~CLSinLayer() = default; @@ -198,17 +195,16 @@ void CLSinLayer::run() struct CLAbsLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClAbs> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClAbs> op{nullptr}; }; -CLAbsLayer::CLAbsLayer() - : _impl(std::make_unique<Impl>()) +CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>()) { } -CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; +CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default; CLAbsLayer::~CLAbsLayer() = default; @@ -239,17 +235,16 @@ void CLAbsLayer::run() struct CLLogLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClLog> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLog> op{nullptr}; }; -CLLogLayer::CLLogLayer() - : _impl(std::make_unique<Impl>()) +CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>()) { } -CLLogLayer::CLLogLayer(CLLogLayer &&) = default; +CLLogLayer::CLLogLayer(CLLogLayer &&) = default; CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default; CLLogLayer::~CLLogLayer() = default; @@ -280,17 +275,16 @@ void CLLogLayer::run() struct CLRoundLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClRound> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClRound> op{nullptr}; }; -CLRoundLayer::CLRoundLayer() - : _impl(std::make_unique<Impl>()) +CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>()) { } -CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; +CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default; CLRoundLayer::~CLRoundLayer() = default; diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 
cf136dc75e..48e9ae824a 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -52,10 +54,14 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +void CLFFT1D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Decompose size to radix factors const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -74,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, + digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); _fft_kernels.reserve(_num_ffts); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -90,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>()); - _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, + ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? 
_scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -120,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -129,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); @@ -148,13 +157,13 @@ void CLFFT1D::run() CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling - if(_run_scale) + if (_run_scale) { CLScheduler::get().enqueue(*_scale_kernel, true); } diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index e0497ca6dc..3857046719 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -33,7 +35,10 @@ namespace arm_compute { CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -44,10 +49,14 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) +void CLFFT2D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Setup first pass FFT1DInfo first_pass_config; @@ -85,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index 41b02d03f2..2a73517549 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,10 +25,12 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -48,11 +50,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -102,17 +104,32 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem { } -void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); - ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), + biases != nullptr ? 
biases->info() : nullptr, + output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; _original_bias = biases; @@ -121,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -143,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -161,10 +181,11 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Flip weights _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); + _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis, + /* use_inverted_axis */ false); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -172,10 +193,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -199,7 +220,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _memory_group.manage(&_itransformed_output); FFT2DInfo itransform_info; itransform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itransform_info); _output_reduced.allocator()->allocate(); @@ -211,25 +233,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_bottom = +
_reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, + Coordinates(start_left, start_top), Coordinates(end_right, end_bottom)); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -240,7 +265,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convolved output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -252,7 +277,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(compile_context, output, nullptr, act_info); } @@ -266,8 +291,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_axis.unmap(); } -Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math); @@ -284,24 +314,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x()); } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) {
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -317,7 +350,7 @@ void CLFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -333,17 +366,17 @@ void CLFFTConvolutionLayer::run() _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer()); _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -351,10 +384,10 @@ void CLFFTConvolutionLayer::run() void CLFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -363,7 +396,7 @@ void CLFFTConvolutionLayer::prepare() const ICLTensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index b22d79fea4..9bd96a975e 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClFill.h" +#include "src/gpu/cl/operators/ClFill.h" #include <utility> @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFill::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFill> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFill> op{nullptr}; }; -CLFill::CLFill() - : _impl(std::make_unique<Impl>()) +CLFill::CLFill() : _impl(std::make_unique<Impl>()) { } -CLFill::CLFill(CLFill &&) = default; +CLFill::CLFill(CLFill &&) = default; CLFill &CLFill::operator=(CLFill &&) = default; CLFill::~CLFill() = default; @@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window); } -void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window) +void CLFill::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp deleted file mode 100644 index 2e5a29ece1..0000000000 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * 
Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLFillBorder.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include <utility> - -using namespace arm_compute; - -void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value); -} - -void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - auto k = std::make_unique<CLFillBorderKernel>(); - k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 9563055276..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -26,26 +26,26 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/runtime/gpu/cl/operators/ClFlatten.h" +#include "src/gpu/cl/operators/ClFlatten.h" namespace arm_compute { struct CLFlattenLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFlatten> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFlatten> op{nullptr}; }; -CLFlattenLayer::CLFlattenLayer() - : _impl(std::make_unique<Impl>()) +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>()) { } -CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; CLFlattenLayer::~CLFlattenLayer() = default; @@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - 
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique<opencl::ClFlatten>(); _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); @@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return opencl::ClFlatten::validate(input, output); @@ -83,4 +85,4 @@ void CLFlattenLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 4c5e482b10..4322219dd9 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -27,23 +27,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClFloor.h" +#include "src/gpu/cl/operators/ClFloor.h" namespace arm_compute { struct CLFloor::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFloor> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFloor> op{nullptr}; }; -CLFloor::CLFloor() - : _impl(std::make_unique<Impl>()) +CLFloor::CLFloor() : _impl(std::make_unique<Impl>()) { } -CLFloor::CLFloor(CLFloor &&) = default; +CLFloor::CLFloor(CLFloor &&) = default; CLFloor &CLFloor::operator=(CLFloor &&) = default; CLFloor::~CLFloor() = default; diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 31c8908270..b30f9e701f 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,472 +23,137 @@ */ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" -#include "support/Cast.h" -#include <algorithm> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -namespace +struct CLFullyConnectedLayer::Impl { -Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, - GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) -{ - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.gemmlowp_multiplier = 0; - gemmlowp_output_stage.gemmlowp_shift = 0; - - const auto data_type = input.data_type(); - - // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) - { - const QuantizationInfo oq_info = output.quantization_info(); - const UniformQuantizationInfo iq_unif = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); - const UniformQuantizationInfo 
oq_unif = oq_info.uniform(); - - const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif; - - const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - - if(activation_info.enabled()) - { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage.gemmlowp_shift = output_shift; - gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); - gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); - type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); - type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); - } - - return Status{}; -} - -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info)); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - ActivationLayerInfo()); // activation_info + std::unique_ptr<opencl::ClFullyConnected> op{nullptr}; - if(is_data_type_quantized_asymmetric(input.data_type())) - { - const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); - - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset); - const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + const ITensor *original_weights{nullptr}; - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); - } + ITensorPack run_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - return Status{}; -} -} // namespace + bool is_prepared{false}; + bool dynamic_weights{false}; +}; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), 
_flatten_layer(), _reshape_weights_function(), - _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), - _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - fc_info.activation_info, // activation_info - fc_info.constant_weights); // constant_weights - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); +CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; - // Configure gemmlowp function - _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info); - } -} - -void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW)); - - // Configure flatten kernel - 
_memory_group.manage(&_flatten_output); - _flatten_layer.configure(compile_context, input, &_flatten_output); - - // Configure matrix multiply kernel - configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info); - - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); -} - -void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); - - // Configure matrix multiply kernel - configure_mm(compile_context, input, weights, bias, output, fc_info); -} - -void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { + // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + _impl->op = std::make_unique<opencl::ClFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = fc_info.retain_internal_weights; - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_prepared = fc_info.retain_internal_weights; - _original_weights = weights; + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_weights_manager) + if (_impl->weights_manager != nullptr) { - _weights_manager->manage(weights); + _impl->weights_manager->manage(_impl->original_weights); } - const ICLTensor *weights_to_use = weights; - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) + if (!_impl->is_prepared) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(compile_context, weights); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function)); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } - - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(compile_context, weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed)); - } - else - { - // Convert weights - _convert_weights.configure(compile_context, weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->run_pack.add_tensor(ACL_SRC_0, input); + _impl->run_pack.add_tensor(ACL_DST, output); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const 
ITensorInfo *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights)); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - - // Validate matrix multiply kernel - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); - - return Status{}; + return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); } void CLFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - _flatten_layer.run(); + prepare(); } - // Run matrix multiply - if(_is_quantized) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in the prepare stage + release_temporaries<CLTensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) + // Handle the weights-manager infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b is marked as unused (memory released) only after the last function that uses b has finished its prepare stage + // This is for cases where multiple functions share the same b (weights) + // Therefore, when a function marks the original b as unused, we pre-mark it in the weights manager and mark it back as used so that it is not released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute
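The CLFullyConnectedLayer rewrite above replaces the hand-rolled reshape/convert/GEMM pipeline with a thin wrapper around the internal opencl::ClFullyConnected operator; the public configure()/validate()/run()/prepare() interface is unchanged. Below is a minimal usage sketch, not part of the patch: the tensor shapes are hypothetical and the default FullyConnectedLayerInfo is assumed, in which case the weights are static and run() triggers prepare() on first use.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create a default CL context and queue

    // Hypothetical sizes: 128 inputs, 16 outputs
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    CLFullyConnectedLayer fc;
    fc.configure(&src, &weights, &bias, &dst); // validates, then builds the ClFullyConnected operator

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map the tensors and fill src/weights/bias here ...

    fc.run(); // first call runs prepare(), which releases prepare-only workspace tensors
    CLScheduler::get().sync();
    return 0;
}

Note that with the new Impl, run() skips the implicit prepare() only when the weights are dynamic (non-constant, to-be-transposed, and not retained), mirroring the _impl->dynamic_weights flag set in configure().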
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 2945508012..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,6 +28,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" namespace arm_compute @@ -39,28 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization() { } CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type)
{ - return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void CLFuseBatchNormalization::run() diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 35126ec0d7..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,9 +30,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CL/functions/CLGEMM.h" + #include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/gpu/cl/operators/ClGemm.h" +#include "src/gpu/cl/operators/ClGemm.h" namespace arm_compute { @@ -41,19 +41,15 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *a{ nullptr }; - const ICLTensor *b{ nullptr }; - const ICLTensor *c{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; - CLTensor weights_transformed{}; + IWeightsManager *weights_manager{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) @@ -65,41 +61,59 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager * CLGEMM::~CLGEMM() = default; -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - _impl->a = a; _impl->b = b; - _impl->c = c; - _impl->dst = output; _impl->op = std::make_unique<OperatorType>(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + alpha, beta, gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxiliary tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { - _impl->run_pack.add_const_tensor(ACL_SRC_0, _impl->a); - _impl->run_pack.add_tensor(ACL_DST, _impl->dst); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, _impl->a }, { ACL_SRC_2, _impl->c }, { ACL_DST, _impl->dst } }; - _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } @@ -110,21 +124,20 @@ void CLGEMM::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->a, _impl->b, _impl->dst); _impl->op->run(_impl->run_pack); } void CLGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); }
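With the CLGEMM change above, the Impl no longer caches the a/c/dst pointers: tensors travel through the ITensorPack instead, and prepare() marks b as unused once the workspace holds a persistent (reshaped) copy of it. The public entry points are untouched. A minimal SGEMM sketch follows, with hypothetical sizes and the default GEMMInfo assumed:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const unsigned int M = 64, N = 32, K = 128; // hypothetical GEMM dimensions
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32)); // TensorShape is (width, height)
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));

    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // dst = 1.0f * (a * b), no addend c

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b here ...

    gemm.run(); // run() prepares on first use; afterwards b may be marked unused
    CLScheduler::get().sync();
    return 0;
}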
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 3184d5dfe0..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,22 +23,17 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLCol2ImKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "src/core/CL/kernels/CLIm2ColKernel.h" -#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" -#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" #include <cmath> @@ -49,634 +44,117 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() - : _weights_reshape_kernel(std::make_unique<CLWeightsReshapeKernel>()) -{ -} - -CLConvolutionLayerReshapeWeights::~CLConvolutionLayerReshapeWeights() = default; - -void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); -} - -void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info(), - num_groups)); - - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ICLTensor *biases_to_use = (append_biases) ?
biases : nullptr; - - _weights_reshape_kernel->configure(compile_context, weights, biases_to_use, output, num_groups); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +struct CLGEMMConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type())); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - CLWeightsReshapeKernel::validate(weights, biases, output, num_groups); - } - - return Status{}; -} - -void CLConvolutionLayerReshapeWeights::run() -{ - CLScheduler::get().enqueue(*_weights_reshape_kernel); -} - -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(std::make_unique<CLIm2ColKernel>()), _mm_gemm(memory_manager, - weights_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(std::make_unique<CLCol2ImKernel>()), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) + const ITensor *weights{nullptr}; + std::unique_ptr<opencl::ClGemmConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; -void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - _skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other convolution layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply function - _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info); - } + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) -{ - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->quantization_info(); - - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Perform validation step on GEMMLowp - return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info); - } - else - { - // Perform validation step on Matrix multiply function - return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, 
gemm_info); - } -} - -void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); -} - -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - const unsigned int num_kernels = weights->info()->dimension(idx_kernels); - - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - _skip_col2im = data_layout == DataLayout::NHWC; - - // Only for quantize there are few cases where we cannot fuse the activation function in GEMM - _fuse_activation = true; - - // Set the GPU target for im2col and col2im - _im2col_kernel->set_target(CLScheduler::get().target()); - _col2im_kernel->set_target(CLScheduler::get().target()); - - const ICLTensor *gemm_input_to_use = input; - ICLTensor *gemm_output_to_use = output; - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ICLTensor *biases_to_use = biases; - bool 
append_bias = false; - - ICLTensor *weights_to_use = &_weights_reshaped; - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, biases, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups); - } - } - else - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups); - } - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure and tune im2col. im2col output shape is auto-initialized - _im2col_kernel->configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); - - // Set quantization info - _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); - CLScheduler::get().tune_kernel_static(*_im2col_kernel); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create GEMM output tensor - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // If we cannot skip col2im it means we run im2col as well - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - TensorInfo info_gemm(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - - // Configure output stage for quantized case - if(_is_quantized) - { - const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info; - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); - const unsigned int num_filters = (is_quantized_per_channel) ? 
num_kernels : 1; - - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input->info(), - weights->info(), - output->info(), - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - - auto min_activation = min_val.get<int32_t>(); - auto max_activation = max_val.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - _fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // Configure and tune GEMM - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - - configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); - CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); - } - - if(!_skip_col2im) - { - _gemm_output.allocator()->allocate(); - } - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); - - if(!_fuse_activation) - { - _activationlayer_function.configure(compile_context, output, nullptr, act_info); - } - - ARM_COMPUTE_UNUSED(weights_info); + _impl->weights = weights; + _impl->op = std::make_unique<opencl::ClGemmConv2d>(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + }; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - - if(!is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW)); - - const DataLayout data_layout = input->data_layout(); - const DataType data_type = input->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - const unsigned int num_kernels = weights->dimension(idx_kernels); - - TensorInfo im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; - - 
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if(act_info.enabled()) - { - ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a()); - } - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ITensorInfo *biases_to_use = biases; - bool append_bias = false; - - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type); - } - - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - const Size2D kernel_dims(kernel_width, kernel_height); - - // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups); - - auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create GEMM output tensor - if(!skip_col2im) - { - TensorShape shape_gemm; - - shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - info_gemm = TensorInfo(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - if(is_quantized) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info; - const unsigned int num_filters = (is_quantized_per_channel) ? 
num_kernels : 1; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input, - weights, - output, - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - int min_activation = 0; - int max_activation = 0; - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info)); - - // Validate Col2Im - if(!skip_col2im) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups)); - } - - //Validate Activation Layer - if(!fuse_activation) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); } void CLGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run im2col - if(!_skip_im2col) - { - CLScheduler::get().enqueue(*_im2col_kernel); - } - - // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - CLScheduler::get().enqueue(*_col2im_kernel.get(), false); - } - - //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) - { - _activationlayer_function.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + _impl->op->prepare(_impl->prep_pack); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) { - _weights_manager->run(_original_weights, &_reshape_weights_managed); + _impl->weights->mark_as_unused(); } else { 
- // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); + // Pack the B matrix to be used as the underlying GEMM performs no reshapes + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); } - - CLScheduler::get().queue().finish(); - _is_prepared = true; + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index d5d1b5f41e..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,19 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "src/core/CL/kernels/CLIm2ColKernel.h" -#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include <tuple> @@ -44,12 +40,13 @@ namespace arm_compute { namespace { -std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair<Coordinates, Coordinates> +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -67,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ 
-82,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -126,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default; -Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); DataLayout data_layout = input->data_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -148,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = 
weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -170,76 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, 
&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), deconv_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -248,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. - if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -260,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -272,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. 
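The negation described in the comment above is easy to sanity-check in isolation: with asymmetric quantization real = scale * (q - offset), a core that adds its per-matrix offset reproduces the subtraction only when handed -offset. A standalone check with illustrative numbers:

#include <cassert>

int main()
{
    const float scale  = 0.5f;
    const int   offset = 10; // example zero point
    const int   q      = 14; // example quantized value

    const float canonical     = scale * static_cast<float>(q - offset);    // 2.0f
    const float core_with_neg = scale * static_cast<float>(q + (-offset)); // offset-adding core, negated offset
    assert(canonical == core_with_neg);
    return 0;
}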
QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -289,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -301,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -309,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -327,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. 
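In the quantized branch just configured, construct_gemmlowp_output_stage() folds the input, weight and output scales into a single real multiplier, and quantization::calculate_quantized_multiplier() decomposes it into an int32 multiplier plus a right shift so that the QUANTIZE_DOWN_FIXEDPOINT stage stays in integer arithmetic. A sketch of that decomposition for the multiplier-in-(0,1) case, standard library only; the library helper additionally handles multipliers of 1 or more and further rounding corner cases:

#include <cassert>
#include <cmath>
#include <cstdint>

void quantize_multiplier(double multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    assert(multiplier > 0.0 && multiplier < 1.0);
    int          exponent = 0;
    const double q        = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    int64_t      q_fixed  = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // rounding pushed q up to 1.0: renormalise
    {
        q_fixed /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
    *right_shift          = -exponent; // applied after taking the high 32 bits of the 64-bit product
}

int main()
{
    // e.g. iq.scale = 0.5, wq.scale = 0.25, oq.scale = 1.0  ->  real multiplier 0.125
    int32_t m = 0;
    int     s = 0;
    quantize_multiplier(0.5 * 0.25 / 1.0, &m, &s);
    assert(m == (1 << 30) && s == 2); // x * m in Q31, then >> 2, approximates x * 0.125
    return 0;
}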
- if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -353,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -369,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run() CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -382,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -395,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -404,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -414,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 3be09581bd..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,683 +31,103 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "src/core/gpu/cl/kernels/ClCastKernel.h" -#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; - -namespace -{ -inline bool 
validate_gemm_kernel(CLGEMMKernelType kernel_type) -{ - switch(kernel_type) - { - case CLGEMMKernelType::NATIVE: - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - return true; - } - default: - { - return false; - } - } -} -//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) -{ - auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) - { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); - return gemm_kernel.gemm_type; - } - } - gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); - return gemm_kernel.gemm_type; -} -// Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) -{ - // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel - TensorInfo mm_result_s32_info{}; - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); - // Validate mm kernel - // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info - // NOTE: This assumes: - // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). - // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). 
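The selection helpers deleted in this hunk all share one control flow: consult the MLGO-trained heuristic first, keep its answer only if the target kernel's validate() accepts it, and otherwise fall back to the hand-written default heuristic, which is assumed to always validate. Stripped of the ACL types, the pattern reduces to the following standalone sketch (names hypothetical):

#include <optional>

template <typename Config, typename Validate, typename Fallback>
Config select_with_fallback(std::optional<Config> mlgo_choice, Validate is_valid, Fallback default_choice)
{
    if (mlgo_choice.has_value() && is_valid(*mlgo_choice))
    {
        return *mlgo_choice; // MLGO answer survives kernel validation
    }
    return default_choice(); // default heuristics are the safety net
}

int main()
{
    const int cfg = select_with_fallback<int>(
        std::optional<int>{42}, [](int c) { return c % 2 == 0; }, [] { return 8; });
    return cfg == 42 ? 0 : 1;
}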
- if(!bool(CLGEMMLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) - { - return false; - } - return true; -} +#include "arm_compute/runtime/IMemoryManager.h" -// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) -{ - auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) - { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; - } - } - config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; -} +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" -// Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +namespace arm_compute { - // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel - TensorInfo tmp_b_info{}; - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) - { - return false; - } - // Validate mm kernel - // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info - // NOTE: This assumes: - // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). - // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). 
- GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - // Since we ignore the output stage, output data type has to be S32 to pass the validation - TensorInfo output_info_copy(*output); - output_info_copy.set_data_type(DataType::S32); - if(!bool(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) - { - return false; - } - return true; -} +using namespace arm_compute::experimental; +using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; -// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +struct CLGEMMLowpMatrixMultiplyCore::Impl { - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) - { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; - } - } - config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; -} - -inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) -{ - switch(kernel_type) - { - case CLGEMMKernelType::NATIVE: - return false; - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - return true; - default: - ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); - } -} -} // namespace + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(std::make_unique<opencl::kernels::ClCastKernel>()), - _mm_native_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()), - _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()), - _mtx_b_reshape_kernel(std::make_unique<opencl::kernels::ClGemmReshapeRhsMatrixKernel>()), - _mtx_a_reduction_kernel(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _mtx_b_reduction_kernel(std::make_unique<CLGEMMLowpMatrixBReductionKernel>()), - _offset_contribution_kernel(std::make_unique<CLGEMMLowpOffsetContributionKernel>()), - _offset_contribution_output_stage_kernel(std::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()), - _qasymm8_weights(), - _vector_sum_col(), - _vector_sum_row(), - _tmp_b(), - _mm_result_s32(), - 
_gemm_output_stage_multipliers(), - _gemm_output_stage_shifts(), - _matrix_a(nullptr), - _original_b(nullptr), - _output(nullptr), - _a_offset(0), - _b_offset(0), - _is_gemm_reshaped(true), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _run_output_stage(false), - _convert_to_qasymm8(false), - _run_offset_contribution(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default; -void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure( + const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info); } -void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - _is_prepared = false; - _original_b = b; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _a_offset = a->info()->quantization_info().uniform().offset; - _matrix_a = a; - _output = output; - - _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type()) - && a->info()->data_type() == DataType::QASYMM8; - _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_native_kernel->set_target(gpu_target); - _mm_reshaped_only_rhs_kernel->set_target(gpu_target); - - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - // Check if we need to reshape the matrix A and matrix B - _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->info()->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run)); - if(_convert_to_qasymm8) - { - // Set data type for converted weights - TensorInfo weights_info(*b->info()); - weights_info.set_data_type(DataType::QASYMM8); - _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8->configure(compile_context, b->info(), _qasymm8_weights.info(), ConvertPolicy::WRAP); - } - - const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_is_gemm_reshaped) - { - matrix_b = &_tmp_b; - - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } + _impl->b = b; + _impl->op = std::make_unique<OperatorType>(); + _impl->is_prepared = gemm_info.retain_internal_weights(); - // Pick up the GEMM configuration - // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), output->info()); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + gemm_info); + _impl->aux_mem_req = _impl->op->workspace(); - // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), _tmp_b.info(), rhs_info); - } - - // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + // Manage/allocate auxiliary tensors + if (_impl->is_prepared) { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ?
&_qasymm8_weights : b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = _a_offset; - gemm_kernel_info.b_offset = _b_offset; - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = _matrix_a->info()->data_type(); - if(num_filters == 1) - { - // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. - // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts - gemmlowp_output_stage.is_quantized_per_channel = false; - } - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - - if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - } - else - { - _run_output_stage = true; - - _memory_group.manage(&_mm_result_s32); - - if(_is_gemm_reshaped) - { - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - _matrix_a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : matrix_b->info(), reshape_info); - - // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - _mm_result_s32.allocator()->allocate(); - } - } - - _gemm_output_stage_multipliers.allocator()->allocate(); - _gemm_output_stage_shifts.allocator()->allocate(); - // Compute GEMM output multipliers and shifts for output stage - _gemm_output_stage_multipliers.map(); - _gemm_output_stage_shifts.map(); - std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); - std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); - _gemm_output_stage_multipliers.unmap(); - _gemm_output_stage_shifts.unmap(); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _run_offset_contribution = true; - if(_is_gemm_reshaped) - { - // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), reshape_info); - - // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, reshape_info); - } - - // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); - } - - // Allocate tensors - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - const ITensorInfo *matrix_a_info = a; - - TensorInfo tmp_b_info{}; - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); - TensorInfo weights_info(*b); - if(convert_to_qasymm8) - { - b_offset = -128; - weights_info.set_data_type(DataType::QASYMM8); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); - } - const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) - { - matrix_b_info = &tmp_b_info; - - // Pick up the GEMM configuration - // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; - - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); - } - - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info; - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = a_offset; - gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? 
gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = a->data_type(); - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - else - { - TensorInfo mm_result_s32_info{}; - - if(reshape_matrix_b) - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); - } - else - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); - - // Pick up the GEMM configuration - // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - } - else - { - if(reshape_matrix_b) - { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); - } - else - { - // Pick up the GEMM configuration - // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); - } - - if(output->total_size() != 0) - { - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); - } - } - - return Status{}; + return OperatorType::validate(a, b, c, output, gemm_info); } void CLGEMMLowpMatrixMultiplyCore::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - // Run reshape matrix B - ITensorPack mtx_b_pack; - mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); - CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); - } - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); - } - - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false); - } - - // Run matrix multiply - if(_is_gemm_reshaped) - { - CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false); - } - else - { - CLScheduler::get().enqueue(*_mm_native_kernel, false); - } - if(_run_output_stage) - { - // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true); - } - if(_run_offset_contribution) - { - // Run offset contribution kernel - CLScheduler::get().enqueue(*_offset_contribution_kernel, true); - } + _impl->op->run(_impl->run_pack); } void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_convert_to_qasymm8) - { - _qasymm8_weights.allocator()->allocate(); - ITensorPack convert_to_qs8_pack = { { ACL_SRC, _original_b }, { ACL_DST, &_qasymm8_weights } }; - CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); - } - - if(_is_gemm_reshaped && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - ITensorPack mtx_b_pack;
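// [Editor's note] run() and prepare() above now forward everything to a backend
// operator through an ITensorPack, which maps slot IDs to tensors instead of the
// function owning kernels itself. A minimal sketch of the pattern with
// hypothetical tensors and operator (slot IDs and pack calls as used in this diff;
// tensor/operator setup elided):
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC_0, &a);   // inputs are registered const
pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
pack.add_tensor(TensorType::ACL_DST, &dst);         // outputs stay mutable
op->prepare(pack);  // one-shot work (e.g. weight reshape), safe to call once
op->run(pack);      // per-inference work; the same pack is reused every call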
- mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); - CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); - _original_b->mark_as_unused(); - } + _impl->op->prepare(_impl->run_pack); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); - } + // Release temporary tensors that are only used in the prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); - CLScheduler::get().queue().finish(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index be452aaf3d..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,166 +23,73 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h" #include <algorithm> namespace arm_compute { -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} - -void
CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) +struct CLGEMMLowpOutputStage::Impl { - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr}; + ITensorPack run_pack{}; +}; + +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; + +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = 
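// [Editor's note] The removed wrapper classes in this file differ only in the
// GEMMLowpOutputStageInfo they populate before reaching the same fixed-point
// kernel. A sketch of the equivalent setup through the consolidated
// CLGEMMLowpOutputStage function below (values and tensors are placeholders):
GEMMLowpOutputStageInfo info{};
info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
info.gemmlowp_multiplier = result_fixedpoint_multiplier; // fixed-point rescale factor
info.gemmlowp_shift      = result_shift;                 // right shift applied after the multiply
info.gemmlowp_offset     = result_offset_after_shift;    // destination zero point
info.gemmlowp_min_bound  = min;                          // clamping bounds
info.gemmlowp_max_bound  = max;
info.output_data_type    = DataType::QASYMM8;            // or QASYMM8_SIGNED / QSYMM16
CLGEMMLowpOutputStage output_stage;
output_stage.configure(&input_s32, &bias_s32, &output_q8, info); // hypothetical tensors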
std::move(k); -} + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); + _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>(); + _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::run() { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } + _impl->op->run(_impl->run_pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index bde34dc4db..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ 
b/src/runtime/CL/functions/CLGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute @@ -33,8 +35,13 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique<CLGatherKernel>(); k->configure(compile_context, input, indices, output, axis); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 81e24dba08..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" @@ -69,47 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default; -void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, +void CLGenerateProposalsLayer::configure(const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); + configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, + num_valid_proposals, info); } -void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, - ICLTensor *scores_out, - ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) +void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + 
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? 
QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -123,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -137,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -160,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -180,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. 
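// [Editor's note] auto_init_if_empty(), used just below, only writes a
// TensorInfo whose total size is still zero, so outputs the caller has already
// configured keep their own shape/type and only truly empty ones get defaults.
// A minimal sketch of that contract (shape and type are illustrative):
TensorInfo out_info;                                  // empty: total_size() == 0
auto_init_if_empty(out_info, TensorShape(100U), 1, DataType::F32, QuantizationInfo());
// out_info now describes a 1-D F32 tensor of 100 elements; a second call would
// be a no-op because the info is no longer empty.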
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -192,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images - _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -213,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), 
DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -224,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + 
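// [Editor's note] A shape walk-through of the permute + flatten checked above,
// assuming an NCHW input with A anchors and 4 box values per ROI (ACL tensor
// shapes list the innermost dimension first):
//   deltas:                      [W, H, 4*A]   NCHW feature map
//   PermutationVector{2, 0, 1}:  [4*A, W, H]   per-anchor values become innermost
//   flatten:                     [4, A*W*H]    one column of 4 deltas per anchor
// scores follow the same path with one value per anchor:
//   [W, H, A] -> [A, W, H] -> [1, A*W*H]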
deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - 
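// [Editor's note] On the QASYMM8 path validated above, boxes are computed in
// F32 and then requantized to QASYMM16 with the fixed QuantizationInfo(0.125f, 0),
// i.e. 1/8-unit resolution and zero offset. A scalar sketch of that affine
// round trip (the coordinate value is illustrative, rounding mode approximated):
#include <cmath>
#include <cstdint>
const float    scale = 0.125f;                                            // QASYMM16 scale
const float    coord = 37.4f;                                             // one box coordinate
const uint16_t q     = static_cast<uint16_t>(std::lround(coord / scale)); // quantize: 299
const float    back  = static_cast<float>(q) * scale;                     // dequantize: 37.375f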
ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -306,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -353,7 +412,7 @@ void CLGenerateProposalsLayer::run() CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -361,7 +420,7 @@ void CLGenerateProposalsLayer::run() _flatten_deltas.run(); _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors->run(); _dequantize_deltas->run(); @@ -370,7 +429,7 @@ void CLGenerateProposalsLayer::run() // Build the boxes CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals->run(); } diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp new file mode 100644 index 0000000000..1a2369c5c2 --- /dev/null +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" + +namespace arm_compute +{ +struct CLIndirectConvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr}; +}; + +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; + +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClIndirectConv2d>(); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
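// [Editor's note] A minimal usage sketch for the new CLIndirectConvolutionLayer
// added in this file (tensor allocation elided; stride and activation values
// are placeholders, not a recommended configuration):
CLIndirectConvolutionLayer conv;
conv.configure(&src, &weights, &biases, &dst,
               PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */),
               ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
conv.run(); // packs ACL_SRC, ACL_SRC_1, ACL_SRC_2, ACL_DST and runs ClIndirectConv2d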
biases->info() : nullptr, output->info(), conv_info, act_info); +} + +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 4a0bda8255..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" @@ -34,40 +36,53 @@ namespace arm_compute { CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT - : _inst_norm_kernel(), - _mean_var_kernel(), - _mean_var_tensor(), - _ctx(ctx) + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) { } CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); auto w = std::make_unique<CLComputeMeanVariance>(); w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); _mean_var_kernel = std::move(w); auto k = std::make_unique<CLInstanceNormalizationLayerKernel>(); - k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _inst_norm_kernel = std::move(k); _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float 
epsilon, + bool use_mixed_precision) { - return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } void CLInstanceNormalizationLayer::run() { - ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured"); + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 8c360aaa9e..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" @@ -55,8 +57,11 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Reset auxiliary tensor _sumsq.allocator()->init(TensorInfo()); @@ -82,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 85d13c246e..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -24,18 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include 
"src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute { @@ -43,51 +40,156 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), - _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), - _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), - _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), - _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), - _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), - _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), - _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), - _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), - _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), - _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), 
+ _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } CLLSTMLayer::~CLLSTMLayer() = default; -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, 
output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); + _is_layer_norm_lstm = lstm_params.use_layer_norm(); // Set lstm parameters @@ -95,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + 
input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -125,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -153,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, 
forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -177,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -194,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); @@ -202,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -220,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -248,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, + 
ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -297,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); @@ -305,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -328,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, 
+ lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -360,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -382,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // Vector for holding the tensors to store in scratch buffer std::vector<const ICLTensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -396,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe output_gate_out->allocator()->allocate(); } -Status CLLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo 
*recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status CLLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -437,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = 
input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -458,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -469,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -487,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type()); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? 
nullptr : forget_gate_bias, &forget_gate)); std::vector<const ITensorInfo *> inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -524,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, 
input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, 
input->data_type()); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + 
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -615,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector<const ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -637,12 +829,12 @@ void CLLSTMLayer::run() _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -650,7 +842,7 @@ void CLLSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { _ones_fill.run(); _subtract_input_gate.run(); @@ -659,13 +851,13 @@ void CLLSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -678,12 +870,10 @@ void CLLSTMLayer::run() ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); - CLScheduler::get().enqueue_op(*_transpose_cell_state, - pack, - false); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -694,19 +884,19 @@ void CLLSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -717,10 +907,10 @@ void CLLSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -734,10 +924,10 @@ void CLLSTMLayer::run() void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index a44dcd2e24..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -25,14 +25,11 @@ 
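// Reviewer note: apart from the added ARM_COMPUTE_LOG_PARAMS calls and mechanical
// clang-format re-wrapping, the one behavioural change in the CLLSTMLayer.cpp hunks
// above is the cell-clip bound order, which moves from (-cell_threshold, cell_threshold)
// to (cell_threshold, -cell_threshold) in both configure() and validate(). A minimal
// sketch of why the order matters, assuming ActivationLayerInfo(f, a, b) with
// LU_BOUNDED_RELU evaluates min(a, max(b, x)); the helper below is illustrative,
// not the library implementation:
#include <algorithm>

static float lu_bounded_relu(float x, float a, float b)
{
    // 'a' is the upper bound, 'b' the lower bound.
    return std::min(a, std::max(b, x));
}

// With cell_threshold = 1.0f:
//   old order: lu_bounded_relu(x, -1.0f,  1.0f) == -1.0f for every x (degenerate clip)
//   new order: lu_bounded_relu(x,  1.0f, -1.0f) clamps x to [-1.0f, 1.0f] as intended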
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include <memory> @@ -49,44 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + 
_output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + _is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor 
*recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -94,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); 
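// Reviewer note: the hunks below replace the raw (multiplier, shift) overload of
// _output_stage.configure() with an explicit GEMMLowpOutputStageInfo. The S32 gemmlowp
// accumulator in _output_highp is requantized to QSYMM16 with a real multiplier of
// (input_scale * weights_scale) / output_scale, as the surrounding comment notes,
// and calculate_quantized_multiplier() splits that into an integer multiplier and a
// right shift. A minimal sketch of the decomposition, assuming 0 < real_multiplier < 1;
// the helper name and exact rounding here are illustrative, not the library's code:
#include <cmath>
#include <cstdint>

// Decompose real_multiplier as quantized_multiplier * 2^(-shift), with
// quantized_multiplier a Q0.31 fixed-point value in [2^30, 2^31).
static void decompose_multiplier(double real_multiplier, int32_t &quantized_multiplier, int32_t &shift)
{
    shift = 0;
    while (real_multiplier < 0.5)
    {
        real_multiplier *= 2.0;
        ++shift;
    }
    int64_t q = std::llround(real_multiplier * (1ll << 31));
    if (q == (1ll << 31)) // rounding can land exactly on 2^31; renormalize
    {
        q /= 2;
        --shift;
    }
    quantized_multiplier = static_cast<int32_t>(q);
}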
_input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -123,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -143,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -158,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -168,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -179,90 +269,122 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, 
&output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + 
_forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, 
ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -271,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo 
*input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); @@ -292,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, 
input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -336,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + 
ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector<const ITensorInfo *> weights_vector; @@ -346,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -372,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -383,78 +542,107 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, 
&input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + 
CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -463,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -529,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp index 98c98abed5..ea21c54bc3 100644 --- a/src/runtime/CL/functions/CLLogicalAnd.cpp +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -22,8 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include <utility> @@ -31,8 +34,12 @@ namespace arm_compute { namespace experimental { -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); k->configure(compile_context, LogicalOperation::And, input1, input2, output); _kernel = std::move(k); @@ -51,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors) struct CLLogicalAnd::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalAnd> op{nullptr}; }; -CLLogicalAnd::CLLogicalAnd() - : _impl(std::make_unique<Impl>()) +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>()) { } -CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; CLLogicalAnd::~CLLogicalAnd() = default; @@ -70,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp index 388d2bce86..71f9cce54f 100644 --- a/src/runtime/CL/functions/CLLogicalNot.cpp +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -25,23 +25,23 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClLogicalNot.h" +#include "src/gpu/cl/operators/ClLogicalNot.h" namespace arm_compute { struct CLLogicalNot::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - 
std::unique_ptr<opencl::ClLogicalNot> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLogicalNot> op{nullptr}; }; -CLLogicalNot::CLLogicalNot() - : _impl(std::make_unique<Impl>()) +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>()) { } -CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; CLLogicalNot::~CLLogicalNot() = default; @@ -72,4 +72,4 @@ void CLLogicalNot::run() _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp index 897963ab50..3db4fdae84 100644 --- a/src/runtime/CL/functions/CLLogicalOr.cpp +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -22,8 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include <utility> @@ -31,8 +34,12 @@ namespace arm_compute { namespace experimental { -void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); k->configure(compile_context, LogicalOperation::Or, input1, input2, output); _kernel = std::move(k); @@ -51,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors) struct CLLogicalOr::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLLogicalOr> op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalOr> op{nullptr}; }; -CLLogicalOr::CLLogicalOr() - : _impl(std::make_unique<Impl>()) +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>()) { } -CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; CLLogicalOr::~CLLogicalOr() = default; @@ -70,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp new file mode 100644 index 0000000000..e8bdad706b --- /dev/null +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLMatMul.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/operators/ClMatMul.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClMatMul; + +struct CLMatMul::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; +CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +CLMatMul::~CLMatMul() = default; + +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(settings); + configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); +} + +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(settings); + + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; +} + +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); +} + +void CLMatMul::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 52151cdfe1..7494f379b9 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -27,32 +27,44 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _fill(), - _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) + : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) { } CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; -void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); } -void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, indices, output, 
pool_info); const PixelValue zero_value(0.f); _fill.configure(output, zero_value); _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } -Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 0f6a0e47a4..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute @@ -33,8 +35,12 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); auto k = std::make_unique<CLMeanStdDevNormalizationKernel>(); k->configure(compile_context, input, output, epsilon); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index 12560f1b02..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLNormalizationLayerKernel.h" @@ -48,28 +50,35 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); // Configure normalization kernel _norm_kernel->configure(compile_context, input, output, norm_info); - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, 
_norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Run border handler CLScheduler::get().enqueue(*_border_handler, false); @@ -78,4 +87,4 @@ void CLNormalizationLayer::run() // Run normalization kernel CLScheduler::get().enqueue(*_norm_kernel); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 70189a2cb6..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,26 +24,37 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include <utility> namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { + ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>(); k->configure(compile_context, input, output, mean, std); _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index bb7aff218d..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -22,10 +22,12 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPReluLayer.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/gpu/cl/IClKernel.h" -#include "src/runtime/gpu/cl/operators/ClPRelu.h" + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/operators/ClPRelu.h" namespace arm_compute { @@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu; struct CLPReluLayer::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; }; -CLPReluLayer::CLPReluLayer() - : _impl(std::make_unique<Impl>()) +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>()) { } -CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; CLPReluLayer::~CLPReluLayer() = default; @@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { _impl->src_0 = input; _impl->src_1 = alpha; _impl->dst = output; _impl->op = std::make_unique<OperatorType>(); - _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info())); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index d105c0597c..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,34 +22,38 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(std::make_unique<CLPadLayerKernel>()), - _copy(), - _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false) { } CLPadLayer::~CLPadLayer() = default; -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } @@ -59,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } @@ -78,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { CLScheduler::get().enqueue(*_pad_kernel); } @@ -87,4 +93,4 @@ void CLPadLayer::run() _copy.run(); } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index c1da2a9eca..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -27,20 +27,21 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClPermute.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { struct CLPermute::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClPermute> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClPermute> op{nullptr}; }; -CLPermute::CLPermute() - : _impl(std::make_unique<Impl>()) +CLPermute::CLPermute() : _impl(std::make_unique<Impl>()) { } @@ -51,9 +52,13 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, perm); _impl->src = input; _impl->dst = output; @@ -74,4 +79,4 @@ void CLPermute::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 932659268d..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClMul.h" +#include "src/gpu/cl/operators/ClMul.h" #include <utility> @@ -34,38 +35,55 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClMul> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClMul> op{nullptr}; }; -CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } -CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique<opencl::ClMul>(); - _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + 
ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } @@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClComplexMul> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClComplexMul> op{nullptr}; }; -CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; - -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClComplexMul::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp new file mode 100644 index 0000000000..ce1092a7cc --- /dev/null +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPool3d.h" + +namespace arm_compute +{ +struct CLPooling3dLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool3d> op{nullptr}; +}; + +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLPooling3dLayer::~CLPooling3dLayer() = default; + +void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); +} + +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClPool3d>(); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info); +} + +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return opencl::ClPool3d::validate(input, output, pool_info); +} + +void CLPooling3dLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 7ba911c342..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -25,41 +25,52 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClPool2d.h" +#include "src/gpu/cl/operators/ClPool2d.h" namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr<opencl::ClPool2d> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool2d> op{nullptr}; }; -CLPoolingLayer::CLPoolingLayer() - : 
_impl(std::make_unique<Impl>()) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>()) { } CLPoolingLayer::~CLPoolingLayer() = default; -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); } -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { _impl->src = input; _impl->dst = output; _impl->indices = indices; _impl->op = std::make_unique<opencl::ClPool2d>(); - _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? indices->info() : nullptr); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return opencl::ClPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 5ace7c6d7a..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,28 +29,40 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } auto k = std::make_unique<CLPriorBoxLayerKernel>(); @@ -58,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); -}
\ No newline at end of file +} diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index fcf5b9d2a4..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,30 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" namespace arm_compute { using namespace arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -79,33 +85,31 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _input_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - 
_recurrent_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), _layer_norms(), _copy_output() { - for(auto &norm : _layer_norms) + for (auto &norm : _layer_norms) { norm = std::make_unique<CLQLSTMLayerNormalizationKernel>(); } @@ -130,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. 
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -152,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext 
&compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -183,16 +213,20 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out, output, lstm_params); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -213,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -235,49 +269,75 @@ void CLQLSTMLayer::configure(const 
CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + 
_recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); } } // Pre-transpose weights to be used in GEMM. - _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -290,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo 
mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * 
lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -338,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -371,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -386,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - 
quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, 
lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -496,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -518,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -529,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. 
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -558,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + 
_projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -593,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
     _copy_output.configure(compile_context, output_state_out, output);
 }
 
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+                              const ITensorInfo *input_to_forget_weights,
+                              const ITensorInfo *input_to_cell_weights,
+                              const ITensorInfo *input_to_output_weights,
+                              const ITensorInfo *recurrent_to_forget_weights,
+                              const ITensorInfo *recurrent_to_cell_weights,
+                              const ITensorInfo *recurrent_to_output_weights,
+                              const ITensorInfo *forget_gate_bias,
+                              const ITensorInfo *cell_bias,
+                              const ITensorInfo *output_gate_bias,
+                              const ITensorInfo *cell_state_in,
+                              const ITensorInfo *output_state_in,
+                              const ITensorInfo *cell_state_out,
+                              const ITensorInfo *output_state_out,
+                              const ITensorInfo *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -615,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights, recurrent_to_forget_weights,
+                                                       recurrent_to_cell_weights, recurrent_to_output_weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -640,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
 
     // Check whether peephole weights are all there or none
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                       lstm_params.cell_to_output_weights());
 
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                               lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_input_weights());
         }
     }
 
@@ -667,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
@@ -675,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
 
     // Precompute effective bias for optimizing the matmul computations.
     const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.input_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_forget_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_cell_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_output_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                               lstm_params.hidden_state_zero(),
-                                                                               true)));
-        if(lstm_params.projection_bias() != nullptr)
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.projection_weights(), &projection_eff_bias_info,
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+        if (lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
-                                                                       &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                               &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
 
-    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
-    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+                                              input_to_forget_weights->data_type(),
+                                              input_to_forget_weights->quantization_info());
+    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                  recurrent_to_forget_weights->data_type(),
+                                                  recurrent_to_forget_weights->quantization_info());
 
     // Validate weights transpose
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -710,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
     }
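
// Illustrative sketch (not part of this patch): the A-reduction validations above
// rely on the usual GEMMLowp zero-point folding. For an asymmetric input x with
// zero point z and symmetric weights w, (x - z) . w == x . w - z * sum(w), so the
// constant -z * sum(w) can be folded into a per-unit bias once, ahead of the GEMMs.
// Scalar version of that precomputation (names are hypothetical, not ACL API):
#include <cstdint>
#include <vector>

std::vector<int32_t> make_effective_bias(const std::vector<int8_t> &weights, // num_units x input_size, row-major
                                         int num_units, int input_size, int32_t input_zero_point)
{
    std::vector<int32_t> eff_bias(num_units, 0);
    for (int u = 0; u < num_units; ++u)
    {
        int32_t row_sum = 0;
        for (int k = 0; k < input_size; ++k)
        {
            row_sum += weights[u * input_size + k]; // row sum, as the A-reduction kernel computes
        }
        eff_bias[u] = -input_zero_point * row_sum; // folded zero-point correction
    }
    return eff_bias;
}
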
     GEMMLowpOutputStageInfo gemmlowp_info;
@@ -731,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
 
     // Forget gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
-    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
-    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+                                            &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                               &forget_outstage_info, ConvertPolicy::SATURATE));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        const float cell_to_forget_scale = std::pow(2, cell_shift) *
                                           lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
                                           lstm_params.forget_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                                   &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
         const ITensorInfo *b_info = forget_gate_bias;
@@ -763,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
 
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
-    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
-    if(has_layer_norm)
+    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
                                      lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+                                            &cell_outstage_info));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+                                                               &cell_outstage_info, ConvertPolicy::SATURATE));
+
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
         const ITensorInfo *b_info = cell_bias;
@@ -784,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     }
 
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+                                        "Input gate bias must not be present when CIFG is used");
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+                                                                      &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+            input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+                                                       lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
-        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
-        if(lstm_params.has_peephole_opt())
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
                                           qinput.scale / lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                                input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+        const float recurrent_to_input_scale =
+            lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                                &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+                                                &input_outstage_info));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                   &input_outstage_info, ConvertPolicy::SATURATE));
+
+        if (lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                            RoundingPolicy::TO_ZERO));
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
                                                    1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            const float cell_to_input_scale = std::pow(2, cell_shift) *
                                              lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
                                              lstm_params.input_intermediate_scale();
+            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+                cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                       &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
-        if(has_layer_norm)
+        if (has_layer_norm)
         {
             const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
             const ITensorInfo *b_info = lstm_params.input_gate_bias();
             ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_outstage_info, &input_gate_info,
+            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
     }
 
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
-    if(quantized_cell_clip > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    if (quantized_cell_clip > 0)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
-                                                                quantized_cell_clip)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(cell_state_out, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            -quantized_cell_clip, quantized_cell_clip)));
     }
 
     // Output gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
-    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
                                        lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+                                            &output_outstage_info));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                               &output_outstage_info, ConvertPolicy::SATURATE));
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+                                                             DataType::QSYMM16);
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                                   &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
         const ITensorInfo *b_info = output_gate_bias;
@@ -870,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
     }
 
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
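
// Illustrative sketch (not part of this patch): the calculate_quantized_multiplier
// calls above decompose a positive real rescale factor into a Q31 fixed-point
// multiplier and a power-of-two shift, so the output stage can apply
//   result ~= (acc * quantized_multiplier) >> (31 - shift)
// in integer arithmetic. A minimal version of the decomposition, assuming
// round-to-nearest (the library's exact edge-case handling may differ):
#include <cmath>
#include <cstdint>

void quantize_multiplier(double real_multiplier, int32_t *quantized_multiplier, int *shift)
{
    const double q = std::frexp(real_multiplier, shift); // q in [0.5, 1), real = q * 2^shift
    auto q_fixed   = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // rounding pushed q up to exactly 1.0
    {
        q_fixed /= 2;
        ++(*shift);
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
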
     // Hidden.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(cell_state_out, &input_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
 
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
 
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = hidden_out_info.data_type();
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
 
     const bool projection_tensor_copy_required = num_units != output_size;
 
     // Projection.
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+                                                           lstm_params.projection_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
-        const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
-        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
         gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
         gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
         gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
         const TensorInfo projection_outstage_info(*output_state_out);
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+                                                &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
                                                 &projection_outstage_info));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
                                                                   ConvertPolicy::SATURATE));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
        }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
        {
             quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
        }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                    quantized_projection_clip)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+                output_state_out, nullptr,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                    -quantized_projection_clip, quantized_projection_clip)));
        }
     }
     else
     {
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
        {
             ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
        }
     }
 
-    if(cell_state_out->total_size() > 0)
+    if (cell_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
     }
 
-    if(output_state_out->total_size() > 0)
+    if (output_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
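
// Illustrative sketch (not part of this patch): quantize_qasymm8_signed above maps
// the float projection clip into the output's QASYMM8_SIGNED domain. Reference
// math, assuming plain round-to-nearest (ACL's rounding policy may differ):
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t quantize_qasymm8_signed_ref(float value, float scale, int32_t offset)
{
    int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
    return static_cast<int8_t>(std::clamp(q, -128, 127));
}
// e.g. with scale 0.1f and offset 0, a clip of 8.0f quantizes to 80, which then
// bounds output_state_out through LU_BOUNDED_RELU(-80, 80).
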
@@ -973,14 +1210,14 @@ void CLQLSTMLayer::run()
     _recurrent_to_forget_outstage.run();
     _accumulate_input_recurrent_forget.run();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
         _accumulate_cell_forget.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
     }
@@ -995,7 +1232,7 @@ void CLQLSTMLayer::run()
     _recurrent_to_cell_outstage.run();
     _accumulate_input_recurrent_modulation.run();
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
     }
@@ -1003,7 +1240,7 @@ void CLQLSTMLayer::run()
     _cell_gate_tanh.run();
 
     // Input gate
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _input_gate_sub.run();
     }
@@ -1015,14 +1252,14 @@ void CLQLSTMLayer::run()
     _recurrent_to_input_outstage.run();
     _accumulate_input_recurrent_input.run();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_input.run();
         _cell_to_input_outstage.run();
         _accumulate_cell_input.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
     }
@@ -1034,7 +1271,7 @@ void CLQLSTMLayer::run()
     _pixelwise_mul_forget_cell.run();
     _pixelwise_mul_input_cell.run();
     _add_forget_cell.run();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
         _cell_clip.run();
     }
@@ -1045,14 +1282,14 @@ void CLQLSTMLayer::run()
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
     _accumulate_input_recurrent_output.run();
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
         _accumulate_cell_to_output.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
     }
@@ -1065,31 +1302,31 @@ void CLQLSTMLayer::run()
     _hidden_outstage.run();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         _mm_projection.run();
         _projection_outstage.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_output_to_accumulate_copy.run();
         }
 
         _accumulate_projection.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.run();
         }
 
-        if(_has_projection_clipping)
+        if (_has_projection_clipping)
         {
             _projection_clip.run();
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.run();
         }
@@ -1101,7 +1338,7 @@ void CLQLSTMLayer::run()
 
 void CLQLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Pre-transpose weights to be used in GEMM.
         _input_to_forget_weights_transposed.allocator()->allocate();
@@ -1118,18 +1355,25 @@ void CLQLSTMLayer::prepare()
         _transpose_recurrent_to_output_weights.run();
 
         // Precompute effective biases
-        if(_has_cifg)
+        if (_has_cifg)
         {
             _ones.map(true);
-            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 32767);
             _ones.unmap();
         }
         else
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
-            CLScheduler::get().enqueue(*_input_to_input_reduction);
-            CLScheduler::get().enqueue(*_recurrent_to_input_reduction);
+
+            ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
                                                    {ACL_DST, &_input_to_input_eff_bias}};
+            CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
+
+            ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
                                                  {ACL_DST, &_recurrent_to_input_eff_bias}};
+            CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1144,18 +1388,36 @@ void CLQLSTMLayer::prepare()
         _recurrent_to_cell_eff_bias.allocator()->allocate();
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
-        CLScheduler::get().enqueue(*_input_to_forget_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_forget_reduction);
-        CLScheduler::get().enqueue(*_input_to_cell_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_cell_reduction);
-        CLScheduler::get().enqueue(*_input_to_output_reduction);
-        CLScheduler::get().enqueue(*_recurrent_to_output_reduction);
-
-        if(_has_projection)
+
+        ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
                                                 {ACL_DST, &_input_to_forget_eff_bias}};
+        CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
+
+        ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
                                               {ACL_DST, &_recurrent_to_forget_eff_bias}};
+        CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
+
+        ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
+        CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
+
+        ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
                                             {ACL_DST, &_recurrent_to_cell_eff_bias}};
+        CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
+
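
// Illustrative sketch (not part of this patch): the prepare() hunks here replace
// enqueue() on stateful kernels with enqueue_op(), where the migrated
// ClGemmLowpMatrixAReductionKernel receives its tensors per call through an
// ITensorPack instead of holding pointers captured at configure time. The calling
// pattern, with placeholder tensor and kernel names:
void enqueue_reduction(arm_compute::ICLKernel &reduction_kernel,
                       const arm_compute::ICLTensor *weights,
                       arm_compute::ICLTensor *eff_bias)
{
    // Bind source and destination for this invocation only.
    arm_compute::ITensorPack pack = {{arm_compute::ACL_SRC, weights}, {arm_compute::ACL_DST, eff_bias}};
    arm_compute::CLScheduler::get().enqueue_op(reduction_kernel, pack, /* flush */ false);
}
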
+        ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
                                                 {ACL_DST, &_input_to_output_eff_bias}};
+        CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
+
+        ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
                                               {ACL_DST, &_recurrent_to_output_eff_bias}};
+        CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
+
+        if (_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            CLScheduler::get().enqueue(*_projection_reduction);
-            if(_projection_bias != nullptr)
+            ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
+            CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
+            if (_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
@@ -1165,7 +1427,7 @@ void CLQLSTMLayer::prepare()
             _transpose_projection_weights.run();
             _projection_weights->mark_as_unused();
 
-            if(!_projection_tensor_copy_required)
+            if (!_projection_tensor_copy_required)
             {
                 _hidden_gate.mark_as_unused();
                 _projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index e6451b2eb4..6edef29992 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,20 +25,20 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClQuantize.h"
+#include "src/gpu/cl/operators/ClQuantize.h"
 
 namespace arm_compute
 {
 struct CLQuantizationLayer::Impl
 {
-    const ICLTensor                     *src{ nullptr };
-    ICLTensor                           *dst{ nullptr };
-    std::unique_ptr<opencl::ClQuantize>  op{ nullptr };
+    const ICLTensor                     *src{nullptr};
+    ICLTensor                           *dst{nullptr};
+    std::unique_ptr<opencl::ClQuantize>  op{nullptr};
 };
 
-CLQuantizationLayer::CLQuantizationLayer()
-    : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLQuantizationLayer::~CLQuantizationLayer() = default;
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 755fa40121..34b78eefa7 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,27 +28,37 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)),
+      _gemm_state_f(),
+      _add_kernel(),
+      _activation(),
+      _fully_connected_kernel(),
+      _copy(),
+      _fully_connected_out(),
+      _gemm_output(),
+      _add_output(),
       _is_prepared(false)
 {
 }
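
// Illustrative sketch (not part of this patch): CLQuantizationLayer::Impl above is
// the pImpl idiom, which keeps the internal opencl::ClQuantize operator header out
// of the public API. Minimal shape of the pattern, with hypothetical names:
#include <memory>

class Function
{
public:
    Function();
    ~Function(); // defined out of line, where Impl is a complete type
    void run();

private:
    struct Impl;
    std::unique_ptr<Impl> _impl;
};

// In the .cpp file, where the wrapped operator's header may be included:
struct Function::Impl
{
    int state{0}; // stands in for the src/dst pointers and the operator
};
Function::Function() : _impl(std::make_unique<Impl>()) {}
Function::~Function() = default;
void Function::run() { ++_impl->state; }
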
 CLRNNLayer::~CLRNNLayer() = default;
 
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
-                            const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo         *input,
+                            const ITensorInfo         *weights,
+                            const ITensorInfo         *recurrent_weights,
+                            const ITensorInfo         *bias,
+                            const ITensorInfo         *hidden_state,
+                            const ITensorInfo         *output,
+                            const ActivationLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -66,28 +76,43 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
     ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
 
-    auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+    auto shape_info =
+        TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
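
// Illustrative sketch (not part of this patch): the validate() chain above mirrors
// the basic RNN cell h' = act(W x + R h + b), assembled from FullyConnected
// (W x + b), GEMM (R h), Add and Activation. Scalar reference, using tanh:
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> rnn_step(const std::vector<float> &x, const std::vector<float> &h,
                            const std::vector<float> &W, // num_units x input_size, row-major
                            const std::vector<float> &R, // num_units x num_units, row-major
                            const std::vector<float> &b) // num_units
{
    const std::size_t num_units  = b.size();
    const std::size_t input_size = x.size();
    std::vector<float> h_new(num_units);
    for (std::size_t u = 0; u < num_units; ++u)
    {
        float acc = b[u];
        for (std::size_t i = 0; i < input_size; ++i)
            acc += W[u * input_size + i] * x[i]; // FullyConnected contribution
        for (std::size_t j = 0; j < num_units; ++j)
            acc += R[u * num_units + j] * h[j];  // recurrent GEMM contribution
        h_new[u] = std::tanh(acc);               // the function also copies this back into hidden_state
    }
    return h_new;
}
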
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor     *input,
+                           const ICLTensor     *weights,
+                           const ICLTensor     *recurrent_weights,
+                           const ICLTensor     *bias,
+                           ICLTensor           *hidden_state,
+                           ICLTensor           *output,
                            ActivationLayerInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+              output, info);
 }
 
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
-                           ICLTensor *hidden_state,
-                           ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+                           const ICLTensor        *input,
+                           const ICLTensor        *weights,
+                           const ICLTensor        *recurrent_weights,
+                           const ICLTensor        *bias,
+                           ICLTensor              *hidden_state,
+                           ICLTensor              *output,
+                           ActivationLayerInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+                                                    bias->info(), hidden_state->info(), output->info(), info));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
 
     const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
@@ -135,7 +160,7 @@ void CLRNNLayer::run()
 
 void CLRNNLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _fully_connected_kernel.prepare();
         _gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 291ccff958..1939d1d0ba 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,25 +24,39 @@
 #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
 
 #include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
 #include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 namespace arm_compute
 {
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo         *input,
+                                 const ITensorInfo         *rois,
+                                 ITensorInfo               *output,
+                                 const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
     return Status{};
 }
 
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext    &compile_context,
+                                const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
     // Configure ROI pooling kernel
     auto k = std::make_unique<CLROIAlignLayerKernel>();
     k->configure(compile_context, input, rois, output, pool_info);
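
// Illustrative sketch (not part of this patch): CLROIAlignLayer::configure above
// follows the thin kernel-wrapper pattern used throughout these functions.
// Generic shape of it, with a hypothetical KernelT:
#include <memory>
#include <utility>

template <typename KernelT, typename... Args>
std::unique_ptr<KernelT> make_configured_kernel(Args &&...args)
{
    auto k = std::make_unique<KernelT>();      // allocate the kernel
    k->configure(std::forward<Args>(args)...); // bind tensors and options up front
    return k;                                  // the wrapper stores it and enqueues it from run()
}
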
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index cf7d4bcbc3..0d2eab0c76 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,23 +22,38 @@
 * SOFTWARE.
 */
 #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
 #include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 using namespace arm_compute;
 
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo         *input,
+                                   const ITensorInfo         *rois,
+                                   ITensorInfo               *output,
+                                   const ROIPoolingLayerInfo &pool_info)
 {
     return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  ICLTensor                 *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext    &compile_context,
+                                  const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  const ICLTensor           *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
     // Configure ROI pooling kernel
     auto k = std::make_unique<CLROIPoolingLayerKernel>();
     k->configure(compile_context, input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index d4735c875d..5c3f7f9c8c 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLRangeKernel.h"
 
 using namespace arm_compute;
 
@@ -36,8 +38,10 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c
     configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
 }
 
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+    const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
 {
+    ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
     auto k = std::make_unique<CLRangeKernel>();
     k->set_target(CLScheduler::get().target());
     k->configure(compile_context, output, start, end, step);
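
// Illustrative usage sketch (not part of this patch), assuming a default CL
// context is acceptable: CLRange fills a 1-D tensor with the half-open sequence
// [start, end) in increments of step, so the output needs
// ceil((end - start) / step) elements.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"

void make_iota()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();
    CLTensor out;
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32)); // ceil((10 - 0) / 1) = 10
    CLRange range;
    range.configure(&out, /* start */ 0.f, /* end */ 10.f, /* step */ 1.f); // 0, 1, ..., 9
    out.allocator()->allocate();
    range.run();
}
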
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b761dc2f99..bef8d887fd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
@@ -36,12 +38,14 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
@@ -49,29 +53,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
     const int   input_dims = input->num_dimensions();
     Coordinates axis_local = reduction_axis;
 
-    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
         //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         // Only validate if not using auto_init for the output tensor
         TensorShape out_shape = input->tensor_shape();
         // Validate output_shape only if not using auto_init
         convert_negative_axis(axis_local, input_dims);
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
         std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(unsigned int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+        for (unsigned int i = 0; i < reduction_ops; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
             ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-            if(output->total_size() > 0 && keep_dims)
+            if (output->total_size() > 0 && keep_dims)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
             }
-            if(keep_dims)
+            if (keep_dims)
             {
                 out_shape.set(axis_local[i], 1);
             }
@@ -80,13 +91,14 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
                 ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
                 const unsigned int remove_index = axis_local[i] - i;
                 ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
-                out_shape.remove_dimension(remove_index);
+                out_shape.remove_dimension(remove_index, false);
             }
         }
         const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
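
// Illustrative sketch (not part of this patch): the push/ignored/pop pragmas above
// scope the suppression of a GCC false positive to the single std::sort call, so
// genuine -Warray-bounds findings elsewhere still surface. The generic pattern:
#include <algorithm>
#include <cstddef>
#include <vector>

void sort_prefix(std::vector<int> &v, std::size_t n)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    std::sort(v.begin(), v.begin() + static_cast<std::ptrdiff_t>(n)); // false positive on affected GCC releases
#pragma GCC diagnostic pop
}
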
-        const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
-        if(requant)
+        const bool requant =
+            is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+        if (requant)
         {
             TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
             CLDequantizationLayer::validate(input, &input_no_quant);
@@ -96,10 +108,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
     }
     return Status{};
 }
-}
+} // namespace
 
 CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+    : _memory_group(std::move(memory_manager)),
+      _reduction_kernels(),
+      _reduced_outs(),
+      _reshape(),
+      _dequant(),
+      _requant(),
+      _reduction_ops(),
+      _keep_dims(),
+      _do_requant(),
+      _input_no_quant(),
       _output_no_quant()
 {
 }
@@ -109,15 +130,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
     configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
 }
 
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             const Coordinates      &reduction_axis,
+                             bool                    keep_dims,
+                             ICLTensor              *output)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+    ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
+
     // Output auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+    _do_requant    = is_data_type_quantized(input->info()->data_type()) &&
                     input->info()->quantization_info() != output->info()->quantization_info();
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -125,7 +154,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
     ICLTensor *tmp_input  = input;
     ICLTensor *tmp_output = output;
-    if(_do_requant)
+    if (_do_requant)
     {
         _memory_group.manage(&_input_no_quant);
         _memory_group.manage(&_output_no_quant);
@@ -144,46 +173,57 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
     convert_negative_axis(axis_local, input_dims);
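
// Illustrative sketch (not part of this patch): convert_negative_axis above maps
// axes given in [-rank, rank) onto [0, rank), e.g. rank 4, axis -1 -> 3:
#include <cassert>

int wrap_axis(int axis, int rank)
{
    assert(axis >= -rank && axis < rank);
    return axis < 0 ? axis + rank : axis;
}
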
tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!_keep_dims) + if (!_keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) + if (_do_requant) { _requant.configure(compile_context, &_output_no_quant, output); _input_no_quant.allocator()->allocate(); @@ -191,7 +231,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -200,19 +243,19 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) + if (_do_requant) { _dequant.run(); } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) + if (_do_requant) { _requant.run(); } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 61859f8de8..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -27,9 +27,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include 
"arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" @@ -37,23 +39,31 @@ namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false) + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), + _is_reshape_required(false) { } CLReductionOperation::~CLReductionOperation() = default; -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -65,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } @@ -90,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor 
*output) { - if(!_is_reshape_required) + if (!_is_reshape_required) { return output; } @@ -101,24 +112,37 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _reduction_axis = axis; _is_reshape_required = !keep_dims; auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _memory_group.manage(&_unreshaped_output); } @@ -126,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _reduction_kernel = std::make_unique<CLReductionOperationKernel>(); _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(compile_context, &_unreshaped_output, output); _unreshaped_output.allocator()->allocate(); @@ -139,7 +163,7 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp deleted file mode 100644 index 0a1f864543..0000000000 --- a/src/runtime/CL/functions/CLRemap.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
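
The #pragma GCC diagnostic blocks introduced in the CLReduceMean hunks above work around GCC bug 104165, where the compiler raises a false-positive -Warray-bounds diagnostic on the std::sort over the reduction axes. A minimal sketch of the suppression pattern, assuming a GCC-compatible compiler; sort_reduction_axes is a hypothetical stand-in for the library code:

#include <algorithm>

void sort_reduction_axes(int *axes, int num_axes)
{
// Scope the suppression to this one statement: push the diagnostic state,
// silence the false positive, sort, then restore. Behaviour is unchanged.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    std::sort(axes, axes + num_axes);
#pragma GCC diagnostic pop
}
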
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLRemap.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLRemapKernel.h" - -#include <utility> - -using namespace arm_compute; - -void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value); -} - -void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, - BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = std::make_unique<CLRemapKernel>(); - k->configure(compile_context, input, map_x, map_y, output, RemapInfo{ policy, border_mode, PixelValue(constant_border_value) }); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 69b28abab3..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
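
Also from the CLReduceMean hunks above: remove_dimension() is now called with an explicit second argument (false, i.e. no dimension correction) while the reduced dimensions are collapsed one at a time, and the axes are sorted beforehand so that the running offset i keeps the remaining indices valid after each removal. A self-contained sketch of that index arithmetic; drop_axes is illustrative, not library code:

#include <cstddef>
#include <vector>

// Remove the given ascending axes from a shape. Erasing axis a shifts every
// higher axis down by one, which the running offset i compensates for.
std::vector<size_t> drop_axes(std::vector<size_t> shape, const std::vector<int> &sorted_axes)
{
    for (size_t i = 0; i < sorted_axes.size(); ++i)
    {
        shape.erase(shape.begin() + (sorted_axes[i] - static_cast<int>(i)));
    }
    return shape; // e.g. {10, 20, 30, 40} with axes {1, 3} -> {10, 30}
}
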
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReorgLayerKernel.h" #include <utility> @@ -38,8 +40,12 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { + ARM_COMPUTE_LOG_PARAMS(input, output, stride); auto k = std::make_unique<CLReorgLayerKernel>(); k->configure(compile_context, input, output, stride); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 060eddb96c..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -27,25 +27,25 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClReshape.h" +#include "src/gpu/cl/operators/ClReshape.h" /** [CLReshapeLayer snippet] **/ namespace arm_compute { struct CLReshapeLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClReshape> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClReshape> op{nullptr}; }; -CLReshapeLayer::CLReshapeLayer() - : _impl(std::make_unique<Impl>()) +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>()) { } -CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; CLReshapeLayer::~CLReshapeLayer() = default; @@ -78,4 +78,4 @@ void CLReshapeLayer::run() _impl->op->run(pack); } } // namespace arm_compute -/** [CLReshapeLayer snippet] **/
\ No newline at end of file + /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 2a845bae13..a20be2335a 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,24 +24,34 @@ #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { -void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique<CLReverseKernel>(); - k->configure(compile_context, input, output, axis); + k->configure(compile_context, input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status CLReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return CLReverseKernel::validate(input, output, axis); + return CLReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index cbd93c1086..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -26,20 +26,20 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClScale.h" +#include "src/gpu/cl/operators/ClScale.h" namespace arm_compute { struct CLScale::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClScale> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClScale> op{nullptr}; }; -CLScale::CLScale() - : _impl(std::make_unique<Impl>()) +CLScale::CLScale() : _impl(std::make_unique<Impl>()) { } CLScale::~CLScale() = default; @@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLScatter.cpp b/src/runtime/CL/functions/CLScatter.cpp new file mode 100644 index 0000000000..e16fcc4ccc --- 
/dev/null +++ b/src/runtime/CL/functions/CLScatter.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLScatter.h" + +#include "arm_compute/function_info/ScatterInfo.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/gpu/cl/operators/ClScatter.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClScatter; + +struct CLScatter::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; + +CLScatter::CLScatter() : _impl(std::make_unique<Impl>()) +{ +} + +CLScatter::~CLScatter() = default; + +void CLScatter::configure(const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + configure(CLKernelLibrary::get().get_compile_context(), src, updates, indices, output, info); +} + +void CLScatter::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, output); + + _impl->op = std::make_unique<OperatorType>(); + if (src) + { // Src not nullptr. + _impl->op->configure(compile_context, src->info(), updates->info(), indices->info(), output->info(), info); + } + else + { + _impl->op->configure(compile_context, nullptr, updates->info(), indices->info(), output->info(), info); + } + _impl->run_pack = {{ACL_SRC_0, src}, {ACL_SRC_1, updates}, {ACL_SRC_2, indices}, {ACL_DST, output}}; +} + +Status CLScatter::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *output, + const ScatterInfo &info) +{ + return OperatorType::validate(src, updates, indices, output, info); +} + +void CLScatter::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index 5ec18a032f..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
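
The new CLScatter function added above exposes only configure(), validate() and run(), with the source tensor allowed to be null for the in-place path. A hedged usage sketch; run_scatter is a hypothetical wrapper, and how ScatterInfo is constructed should be checked against arm_compute/function_info/ScatterInfo.h:

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/function_info/ScatterInfo.h"
#include "arm_compute/runtime/CL/functions/CLScatter.h"

void run_scatter(arm_compute::ICLTensor *src,
                 arm_compute::ICLTensor *updates,
                 arm_compute::ICLTensor *indices,
                 arm_compute::ICLTensor *output,
                 const arm_compute::ScatterInfo &info)
{
    // Passing a null src selects the branch in configure() above that forwards
    // nullptr for the source info; updates, indices and output stay mandatory.
    ARM_COMPUTE_ERROR_THROW_ON(arm_compute::CLScatter::validate(
        src != nullptr ? src->info() : nullptr, updates->info(), indices->info(), output->info(), info));

    arm_compute::CLScatter scatter;
    scatter.configure(src, updates, indices, output, info);
    scatter.run();
}
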
* * SPDX-License-Identifier: MIT * @@ -25,6 +25,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLSelectKernel.h" using namespace arm_compute; @@ -36,8 +38,13 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); auto k = std::make_unique<CLSelectKernel>(); k->configure(compile_context, c, x, y, output); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index 7f39143dc7..f79c6a1235 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,15 +26,22 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -44,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -63,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct CLSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLSlice> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLSlice> op{nullptr}; }; -CLSlice::CLSlice() - : _impl(std::make_unique<Impl>()) +CLSlice::CLSlice() : _impl(std::make_unique<Impl>()) { } -CLSlice::CLSlice(CLSlice &&) = default; +CLSlice::CLSlice(CLSlice &&) = 
default; CLSlice &CLSlice::operator=(CLSlice &&) = default; CLSlice::~CLSlice() = default; -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::CLSlice::validate(input, output, starts, ends); } @@ -86,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); } -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index de58bf1b02..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -22,16 +22,18 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h" + #include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/gpu/cl/operators/ClPermute.h" -#include "src/runtime/gpu/cl/operators/ClSoftmax.h" +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" +#include "src/gpu/cl/operators/ClSoftmax.h" namespace arm_compute { @@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax; template <bool IS_LOG> struct CLSoftmaxLayerGeneric<IS_LOG>::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData<CLTensor> workspace_tensors{}; @@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { _impl->src = input; _impl->dst = output; _impl->op = std::make_unique<OperatorType>(); - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status 
CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; return OperatorType::validate(*input, *output, softmax_info); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::run() +void CLSoftmaxLayerGeneric<IS_LOG>::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 6180f4de07..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,67 +29,100 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), - _fill(), - _has_padding(false) + : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false) { } CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, 
padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, + output); } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -97,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { //CLScheduler::get().enqueue(*_fill, true); _fill.run(); diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 842d5bc5cc..22695c9ef3 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,12 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) { } @@ -45,8 +46,12 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index 0b27371e3f..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute @@ -38,7 +39,7 @@ void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 3ef6a27675..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include <complex> - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,14 +30,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLStackLayerKernel.h" +#include <complex> + namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -50,15 +50,19 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector<ICLTensor *> &input, + int axis, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>()); _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); @@ -76,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -89,7 +93,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { CLScheduler::get().enqueue(*_stack_kernels[i], false); } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index fd3db9341a..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
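
One change repeated across most functions in this diff (CLReorgLayer, CLReverse, CLSelect, CLSlice, CLSpaceToBatchLayer, CLSpaceToDepthLayer, CLStackLayer, and the CLStridedSlice, CLTile and CLUnstack hunks that follow): each configure() overload now pulls in src/common/utils/Log.h and logs its arguments through ARM_COMPUTE_LOG_PARAMS before doing any work. A minimal sketch of the convention on a hypothetical function:

#include "arm_compute/core/CL/ICLTensor.h"
#include "src/common/utils/Log.h"

#include <cstdint>

namespace arm_compute
{
// Hypothetical configure() body: record the parameters first, then build the
// kernel exactly as before. ARM_COMPUTE_LOG_PARAMS logs argument names and
// values when logging is enabled in the build.
void configure_with_logging(ICLTensor *input, ICLTensor *output, int32_t stride)
{
    ARM_COMPUTE_LOG_PARAMS(input, output, stride);
    // ... kernel creation and configuration unchanged ...
}
} // namespace arm_compute
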
* * SPDX-License-Identifier: MIT * @@ -25,24 +25,38 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto k = std::make_unique<CLStridedSliceKernel>(); k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -50,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct CLStridedSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr<experimental::CLStridedSlice> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<experimental::CLStridedSlice> op{nullptr}; }; -CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) - : _impl(std::make_unique<Impl>()) +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { _impl->ctx = ctx; } -CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; CLStridedSlice::~CLStridedSlice() = default; -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, 
int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -83,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC _impl->dst = output; _impl->op = std::make_unique<experimental::CLStridedSlice>(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } void CLStridedSlice::run() diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 818f10f1ac..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
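
On the CLStridedSlice interface reflowed above: starts, ends and strides are per-dimension coordinates, while begin_mask, end_mask and shrink_axis_mask are bit fields in which bit i addresses dimension i. A set begin/end bit means the given start/end is ignored and the fullest possible range is used; a set shrink bit slices that dimension to size 1 and removes it from the output, which is how CLUnstack, later in this diff, peels off one slice per iteration with (1 << axis). A small sketch of building such a mask; make_shrink_axis_mask is a hypothetical helper:

#include <cstdint>
#include <initializer_list>

// Build a shrink-axis mask covering the given dimensions, e.g. {0, 2} -> 0b101.
int32_t make_shrink_axis_mask(std::initializer_list<unsigned int> axes)
{
    int32_t mask = 0;
    for (unsigned int axis : axes)
    {
        mask |= static_cast<int32_t>(1u << axis); // bit i marks dimension i for removal
    }
    return mask;
}
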
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute @@ -32,8 +33,12 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); auto k = std::make_unique<CLTileKernel>(); k->configure(compile_context, input, output, multiples); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index 142cf73259..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -27,19 +27,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClTranspose.h" +#include "src/gpu/cl/operators/ClTranspose.h" namespace arm_compute { struct CLTranspose::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClTranspose> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClTranspose> op{nullptr}; }; -CLTranspose::CLTranspose() - : _impl(std::make_unique<Impl>()) +CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>()) { } CLTranspose::~CLTranspose() = default; @@ -70,4 +70,4 @@ void CLTranspose::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 28d122b3cf..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -64,14 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector<ICLTensor *> &output_vector, + int axis) { + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -84,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], 
slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -103,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index f758c3d0b3..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -26,25 +26,25 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h" +#include "src/gpu/cl/operators/ClWinogradConv2d.h" #include "support/Cast.h" namespace arm_compute { struct CLWinogradConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr}; ITensorPack run_pack{}; - ITensorPack prep_pack{}; MemoryGroup memory_group{}; WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) @@ -55,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) +void CLWinogradConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void 
CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) + const ActivationLayerInfo &act_info, + bool enable_fast_math) { _impl->src = input; _impl->weights = weights; @@ -71,22 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->dst = output; _impl->op = std::make_unique<opencl::ClWinogradConv2d>(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, _impl->src }, - { TensorType::ACL_SRC_1, _impl->weights }, - { TensorType::ACL_SRC_2, _impl->biases }, - { TensorType::ACL_DST, _impl->dst } - }; - - _impl->prep_pack = { { TensorType::ACL_SRC_1, _impl->weights } }; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } @@ -100,10 +114,14 @@ void CLWinogradConvolutionLayer::run() void CLWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { - _impl->op->prepare(_impl->prep_pack); + _impl->op->prepare(_impl->run_pack); + + // Release Preparation tensors + release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack); + _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1); _impl->is_prepared = true; } } -} // namespace arm_compute
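
The CLWinogradConvolutionLayer hunks above drop the separate prep_pack: prepare() now runs against the same run_pack, the preparation-only workspace tensors are released afterwards, and the original weights (ACL_SRC_1) are removed from the pack because later runs read the transformed weights the operator retains. A condensed sketch of the resulting prepare-once flow; the template stands in for the file-local Impl struct and is illustrative only:

#include "arm_compute/core/Types.h"

#include "src/core/helpers/MemoryHelpers.h"

// Mirrors the members used above: op, run_pack, workspace_tensors, is_prepared.
template <typename ImplT>
void prepare_once(ImplT &impl)
{
    if (!impl.is_prepared)
    {
        impl.op->prepare(impl.run_pack); // one-off weight transform
        // Free buffers only needed during preparation, then drop the original
        // weights from the pack; steady-state run() no longer references them.
        arm_compute::release_prepare_tensors(impl.workspace_tensors, impl.run_pack);
        impl.run_pack.remove_tensor(arm_compute::TensorType::ACL_SRC_1);
        impl.is_prepared = true;
    }
}
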
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp index 390bb97665..4270165ab4 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -25,7 +25,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> #include <utility> @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,131 +44,133 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G71 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G52 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 }, - { DataType::QASYMM8, 
&CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G76 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) + if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) + if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G52: - if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) { - return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned 
int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; + CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; - if(is_rhs_constant) + if (is_rhs_constant) { - if((m > 1) && (n < 16)) + if ((m > 1) && (n < 16)) { - gemm_type = CLGEMMKernelType::RESHAPED_V1; + gemm_type = CLGEMMKernelType::RESHAPED; } - else if(m == 1) + else if (m == 1) { gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if((k > 256) && (m > 4)) + if ((k > 256) && (m > 4)) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - gemm_type = CLGEMMKernelType::NATIVE_V1; + gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } } const auto workload = static_cast<float>((m * n) / 20.0f); - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED_V1)) ? CLGEMMKernelType::RESHAPED : gemm_type; + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; } return gemm_type; } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -179,15 +181,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned } else { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(k <= 496) + if (k <= 496) { - if(n <= 544) + if (n <= 544) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } else { - if(k <= 588) + if (k <= 588) { - if(k <= 552) + if (k <= 552) { - if(m <= 148) + if (m <= 148) { return 
CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 278) + if (m <= 278) { return CLGEMMKernelType::RESHAPED; } @@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k)); - if(r_mn <= 1.5469f) + if (r_mn <= 1.5469f) { - if(r_mk <= 0.8766f) + if (r_mk <= 0.8766f) { - if(r_mk <= 0.0211f) + if (r_mk <= 0.0211f) { - if(r_mnk <= 77.5833f) + if (r_mnk <= 77.5833f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.0832f) + if (r_nk <= 0.0832f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mnk <= 193.0000f) + if (r_mnk <= 193.0000f) { - if(r_mn <= 0.9948f) + if (r_mn <= 0.9948f) { - if(r_mk <= 2.5453f) + if (r_mk <= 2.5453f) { return CLGEMMKernelType::RESHAPED; } @@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 17.7370f) + if (r_mn <= 17.7370f) { - if(r_mnk <= 1391.2875f) + if (r_mnk <= 1391.2875f) { - if(r_mk <= 2.9724f) + if (r_mk <= 2.9724f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mnk <= 470.0000f) + if (r_mnk <= 470.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.1381f) + if (r_nk <= 0.1381f) { - if(r_mnk <= 9040.5000f) + if (r_mnk <= 9040.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 5.6790f) + if (r_mn <= 5.6790f) { return CLGEMMKernelType::RESHAPED; } @@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_nk = static_cast<float>(n) / static_cast<float>(k); - if(k <= 212) + if (k <= 212) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_nk <= 0.4990234375f) + if (r_nk <= 
0.4990234375f) { - if(k <= 1392) + if (k <= 1392) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 325) + if (m <= 325) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } else { - if(k <= 471) + if (k <= 471) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mn <= 0.04475911520421505f) + if (r_mn <= 0.04475911520421505f) { return CLGEMMKernelType::RESHAPED; } @@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 127.0000f) + if (n <= 127.0000f) { - if(n <= 63.5000f) + if (n <= 63.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 3616.0000f) + if (m <= 3616.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 2970.5000f) + if (m <= 2970.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 12.5000f) + if (m <= 12.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 490.0000f) + if (m <= 490.0000f) { - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 226.0000f) + if (m <= 226.0000f) { - if(n <= 140.0000f) + if (n <= 140.0000f) { - if(m <= 179.5000f) + if (m <= 179.5000f) { return CLGEMMKernelType::RESHAPED; } @@ -556,22 +563,18 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { - if(n > k) - { - return CLGEMMKernelType::NATIVE_V1; - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } + return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { @@ -580,7 +583,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int } else { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } } } // namespace cl_gemm diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index b799de6967..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -26,7 +26,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" -#include 
"src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> #include <utility> @@ -35,8 +36,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = - { - { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_configs = { + {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } @@ -68,23 +67,26 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? 
CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp index 982748810d..851e23bc84 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> #include <utility> @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,189 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Valhall architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G77 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, 
&CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G78 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G710 and Mali-G610 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G715 and Mali-G615 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { + case GPUTarget::G710: + case GPUTarget::G610: + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + { + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G715: + case GPUTarget::G615: + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + { + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G78: - if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) { - return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G77: - if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) { - return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return 
(this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) - { - return CLGEMMKernelType::NATIVE_V1; - } + ARM_COMPUTE_UNUSED(m, n, k, b); - if(m == 1) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } + return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} - const float r_mn = static_cast<float>(m) / static_cast<float>(n); - const float r_mk = static_cast<float>(m) / static_cast<float>(k); - const float r_nk = static_cast<float>(n) / static_cast<float>(k); - const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); - if(r_mk <= 0.6817956566810608) - { - if(workload <= 801.6000061035156) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if(r_mn <= 0.0839829258620739) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if(r_mk <= 0.24917218834161758) - { - return CLGEMMKernelType::RESHAPED; - } - else - { - if(workload <= 2551.75) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if(workload <= 5061.574951171875) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - } - } - } - } - else - { - if(r_mk <= 4.849947690963745) - { - if(workload <= 17618.4501953125) - { - if(workload <= 5224.699951171875) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if(r_nk <= 0.7933054566383362) - { - return CLGEMMKernelType::RESHAPED; - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - } - else - { - if(workload <= 20275.2001953125) - { - return CLGEMMKernelType::RESHAPED; - } - else - { - if(r_mk <= 3.07421875) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - } - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } + return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -236,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 471.0000f) + if (k <= 471.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 72.5000f) + if (m <= 72.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 90.5000f) + if (m <= 90.5000f) { return CLGEMMKernelType::RESHAPED; } else { - if(k <= 2448.0000f) + if (k <= 2448.0000f) { - if(n <= 756.0000f) + if (n <= 756.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -295,16 +243,60 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(!is_rhs_constant) + if (!is_rhs_constant) { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } return CLGEMMKernelType::RESHAPED_ONLY_RHS; } + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return default_f32(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return default_f32(m, n, k, b, is_rhs_constant); + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return g78_f16(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return g78_f16(m, n, k, b, is_rhs_constant); + } +} + } // namespace cl_gemm } // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h index c88fbcf557..e190295ee4 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm 
Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,9 @@ private: CLGEMMKernelType g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); CLGEMMKernelType g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); CLGEMMKernelType g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); }; } // namespace cl_gemm } // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index 6189a324cf..98dd44b1bf 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CLGEMMKERNELSELECTION_H -#define SRC_CLGEMMKERNELSELECTION_H +#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H +#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" @@ -45,13 +46,14 @@ public: */ static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu); case GPUTarget::BIFROST: return std::make_unique<CLGEMMDefaultTypeBifrost>(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique<CLGEMMDefaultTypeValhall>(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -60,4 +62,4 @@ public: }; } // namespace cl_gemm } // namespace arm_compute -#endif /* SRC_CLGEMMKERNELSELECTION_H */ +#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp index b8437487f8..8df57197e2 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -27,11 +27,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" -#include "src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" -#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h" -#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h" +#include 
"src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_ bool valid = false; CLGEMMKernelType gemm_type{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); } else { @@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) @@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshapedOnlyRHS config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, - config.export_cl_image); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, 
lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) @@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshaped config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs, - config.transpose_rhs, config.export_cl_image); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) @@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) @@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigNative config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, 
config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter - std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } } // namespace auto_heuristics } // namespace cl_gemm -} // namespace arm_compute
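The select_mlgo_* / select_default_* pairs in this file all follow one pattern: ask the machine-learned (MLGO) heuristics first, and fall back to the hand-written per-GPU defaults when no DotMLGO database is loaded or the query misses. A hedged sketch of how a caller can chain the two; select_gemm_kernel_type is a hypothetical wrapper (not a function from this file), and it assumes the select_default_gemm_kernel counterpart declared alongside select_mlgo_gemm_kernel in the header:

// Sketch: MLGO-first kernel-type selection with a default fallback.
CLGEMMKernelType select_gemm_kernel_type(const auto_heuristics::CommonQuery &query,
                                         bool                               reshape_b_only_on_first_run)
{
    // res.valid is false when the MLGO database is absent or has no matching tree.
    auto res = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
    if (!res.valid)
    {
        // Fall back to the built-in per-GPU heuristics.
        res = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
    }
    return res.gemm_type;
}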
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h index 020237b7f4..f544715e03 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -50,8 +50,7 @@ struct CommonQuery /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ struct GEMMTypeResult { - GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) - : valid{ valid }, gemm_type{ gemm_type } + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} { } /** Test if the result is valid */ @@ -67,7 +66,7 @@ struct GEMMTypeResult struct GEMMConfigResult { GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) - : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info } + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} { } /** Test if the result is valid */ @@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); } // namespace cl_gemm } // namespace arm_compute -#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
\ No newline at end of file +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h index c451bd9062..08a7ee8c18 100644 --- a/src/runtime/CL/mlgo/Common.h +++ b/src/runtime/CL/mlgo/Common.h @@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType; /** GEMM Configuration for Native kernel */ struct GEMMConfigNative { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** GEMM Configuration for Reshaped Only RHS kernel */ struct GEMMConfigReshapedOnlyRHS { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before being stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; /** GEMM Configuration for Reshaped kernel */ struct GEMMConfigReshaped { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before being stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; } // namespace mlgo } // namespace arm_compute -#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
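These mlgo config structs are plain aggregates with default member initializers, so a parsed heuristic result is usable even when the parser fills in only a subset of fields. Tying this back to CLGEMMAutoHeuristics.cpp above, a reshaped-only-RHS config is expanded into kernel descriptors roughly as follows; this is a sketch of the call shown in the diff, where m and n are the GEMM output dimensions taken from the query:

// Sketch: expanding a GEMMConfigReshapedOnlyRHS into LHS/RHS kernel info.
// v0 and interleave_lhs are pinned to 1/false because the LHS is not
// reshaped for this kernel type.
mlgo::GEMMConfigReshapedOnlyRHS cfg{}; // defaults: m0 = n0 = k0 = h0 = 1, all flags false
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(
    m, n, cfg.m0, cfg.n0, cfg.k0, /* v0 */ 1, cfg.h0,
    /* interleave_lhs */ false, cfg.interleave_rhs,
    /* lhs_transpose */ !cfg.transpose_rhs, cfg.transpose_rhs, cfg.export_cl_image);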
\ No newline at end of file +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp index 1c75cdc427..f7b706902b 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.cpp +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/HeuristicTree.h" + #include "arm_compute/core/Log.h" #include "support/Cast.h" @@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond) // PRE: all features and ConditionalOps are valid constexpr float eps = 0.0001f; // Calculate all secondary features - std::vector<std::pair<std::string, float>> cond_values - { - { "m", static_cast<float>(shape.m) }, - { "n", static_cast<float>(shape.n) }, - { "k", static_cast<float>(shape.k) }, - { "b", static_cast<float>(shape.b) }, - { "r_mn", static_cast<float>(shape.m) / shape.n }, - { "r_mk", static_cast<float>(shape.m) / shape.k }, - { "r_nk", static_cast<float>(shape.n) / shape.k }, - { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) }, - { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 } - }; - auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(), - [&cond](decltype(*cond_values.begin()) it) - { - return it.first == cond.feature; - }); + std::vector<std::pair<std::string, float>> cond_values{ + {"m", static_cast<float>(shape.m)}, + {"n", static_cast<float>(shape.n)}, + {"k", static_cast<float>(shape.k)}, + {"b", static_cast<float>(shape.b)}, + {"r_mn", static_cast<float>(shape.m) / shape.n}, + {"r_mk", static_cast<float>(shape.m) / shape.k}, + {"r_nk", static_cast<float>(shape.n) / shape.k}, + {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)}, + {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); const float cond_value = cond_value_pair_it->second; - switch(cond.op) + switch (cond.op) { case ConditionalOp::LT: { @@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes; constexpr size_t HeuristicTree::_max_query_depth; constexpr HeuristicTree::NodeID HeuristicTree::_root; -HeuristicTree::HeuristicTree() - : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) { } HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) - : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{} + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} { } @@ -108,16 +104,17 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const // Root ID = 0; auto cur_node = _tree.at(_root).get(); size_t depth = 0; - while(cur_node->type() != NodeType::Leaf) + while (cur_node->type() != NodeType::Leaf) { - if(depth > _max_query_depth) + if (depth > _max_query_depth) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. 
Is the tree too deep?", + _max_query_depth); return std::make_pair(false, T{}); } ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); - if(evaluate(shape, br_node->condition)) + if (evaluate(shape, br_node->condition)) { cur_node = _tree.at(br_node->true_node).get(); } @@ -135,12 +132,12 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const template <typename T> bool HeuristicTree::add_leaf(NodeID id, T val) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val) bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - const std::set<std::string> supported_features = - { - "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload" - }; - const auto orig_feature = cond.feature; - std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c) - { - return std::tolower(c); - }); - if(supported_features.find(cond.feature) == supported_features.end()) + const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID bool HeuristicTree::check_if_structurally_correct() const { std::set<NodeID> visited; - std::deque<NodeID> to_visit{ _root }; + std::deque<NodeID> to_visit{_root}; - while(!to_visit.empty()) + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); - if(_tree.find(id) == _tree.end()) + if (_tree.find(id) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); return false; } auto not_seen_before = visited.insert(id); - if(!not_seen_before.second) + if (!not_seen_before.second) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); return false; } auto cur_node = _tree.at(id).get(); - if(cur_node->type() == NodeType::Branch) + if (cur_node->type() == NodeType::Branch) { auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); to_visit.push_back(br_node->true_node); to_visit.push_back(br_node->false_node); } } - if(visited.size() != _tree.size()) + if (visited.size() != _tree.size()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); return false; @@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const bool 
HeuristicTree::check() { - if(_tree.empty()) + if (_tree.empty()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered"); return false; } - if(_tree.find(_root) == _tree.end()) + if (_tree.find(_root) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root); return false; @@ -237,7 +229,8 @@ template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shap /** Explicit template instantiation @relates HeuristicTree */ template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ -template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; +template std::pair<bool, GEMMConfigReshapedOnlyRHS> +HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const; diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h index d5c7de2215..a4f8c116b9 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.h +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -25,6 +25,7 @@ #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H #include "arm_compute/core/Types.h" + #include "src/runtime/CL/mlgo/Common.h" #include <map> @@ -84,7 +85,7 @@ public: struct BranchNode : public Node { BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) - : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node } + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} { } NodeType type() const override @@ -100,8 +101,7 @@ public: template <typename T> struct LeafNode : public Node { - LeafNode(NodeID id, T val) - : id{ id }, value{ val } + LeafNode(NodeID id, T val) : id{id}, value{val} { } NodeType type() const override @@ -177,22 +177,22 @@ public: bool check(); private: - static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query - static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree - static constexpr NodeID _root{ 0 }; // Root tree ID + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Root tree ID private: bool check_if_structurally_correct() const; private: - TreeID _id; /**< Heuristic tree ID */ - HeuristicType _heuristic_type; /**< Heuristic type */ - std::string _ip_target; /**< IP target associated with the tree */ - DataType _data_type; /**< Data type associated with the tree */ - std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
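For reference, this is how the tree API above fits together. A minimal sketch, assuming Condition is the {feature, op, threshold} aggregate that the DotMLGO parser later in this patch constructs, and that the mlgo headers are reachable from the include path:

// Sketch only: build a two-leaf tree and query it.
#include "src/runtime/CL/mlgo/HeuristicTree.h"

using namespace arm_compute;
using namespace arm_compute::mlgo;

bool tree_example()
{
    HeuristicTree tree(0, HeuristicType::GEMM_Type, "gpu", DataType::F32);
    // Branch node 0 (the root): if m < 64 take node 1, otherwise node 2
    tree.add_branch(0, Condition{"m", ConditionalOp::LT, 64.f}, 1, 2);
    tree.add_leaf(1, GEMMType::NATIVE);
    tree.add_leaf(2, GEMMType::RESHAPED_ONLY_RHS);
    if (!tree.check()) // verifies the root exists and the nodes form a connected, acyclic tree
    {
        return false;
    }
    // GEMMShape is assumed to aggregate {m, n, k, b}, matching the shape queries above
    auto result = tree.query<GEMMType>(GEMMShape{32, 128, 64, 1});
    return result.first && result.second == GEMMType::NATIVE;
}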
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp index 80f3bb85e9..aed46cd80f 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -24,6 +24,7 @@ #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/MLGOParser.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) } bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, - rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, - rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } constexpr size_t MLGOHeuristics::_max_num_trees; -MLGOHeuristics::MLGOHeuristics() - : _indices{}, _trees{}, _tree_valid{}, _valid{ false } +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} { } @@ -59,71 +60,74 @@ std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) co { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); const auto invalid = GEMMType::RESHAPED; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMType>(shape_query); } std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigNative{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. 
Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigNative>(shape_query); } std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshapedOnlyRHS{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query); } std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshaped{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigReshaped>(shape_query); } @@ -131,14 +135,14 @@ std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(c bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) { bool status; - HeuristicTree *tree{ nullptr }; + HeuristicTree *tree{nullptr}; std::tie(status, tree) = get_heuristic_tree(id); - if(!status) + if (!status) { return status; } status = tree->check(); - if(!status) + if (!status) { return status; } @@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) bool MLGOHeuristics::check_all() const { // Tree validities are already checked and cached. 
- bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) - { - return !v.second; - }) - == _tree_valid.end(); - if(!all_trees_are_checked) + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo"); + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. This could also indicate there are no trees in the dotmlgo"); return false; } @@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) { - if(_indices.find(id) == _indices.end()) + if (_indices.find(id) == _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); return std::make_pair(false, nullptr); } const auto index = _indices[id]; - if(_trees.find(index) == _trees.end()) + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); return std::make_pair(false, nullptr); @@ -186,7 +188,7 @@ std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTre bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) { - if(_indices.size() >= _max_num_trees) + if (_indices.size() >= _max_num_trees) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); return false; @@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // PRE: correctness of t is guaranteed by the tree construction process // Ensure unique id const auto id = t.id(); - if(_indices.find(id) != _indices.end()) + if (_indices.find(id) != _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); return false; @@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // Ensure unique index const auto index = t.index(); - if(_trees.find(index) != _trees.end()) + if (_trees.find(index) != _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); return false; @@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); return _valid = false; } return reload_from_stream(fs); @@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) bool MLGOHeuristics::reload_from_stream(std::istream &in) { auto parsed = parser::parse_mlgo(in); - if(!parsed.first) + if (!parsed.first) { ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead"); return _valid = false; @@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in) } } // namespace mlgo -} // namespace arm_compute
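Taken together, the intended call pattern is: load the DotMLGO file once, then query per GEMM problem. A hedged sketch (the file name is hypothetical, and Query is assumed to aggregate {ip_target, data_type, m, n, k, b} in line with the usages above):

// Sketch only: load a DotMLGO file and query the GEMM kernel type.
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"

#include <tuple>

using namespace arm_compute;
using namespace arm_compute::mlgo;

GEMMType select_gemm_type()
{
    MLGOHeuristics heuristics;
    if (!heuristics.reload_from_file("gemm_heuristics.mlgo")) // hypothetical path
    {
        return GEMMType::RESHAPED; // same fallback value the query itself uses
    }
    bool     found = false;
    GEMMType type{};
    std::tie(found, type) = heuristics.query_gemm_type(Query{"gpu", DataType::F32, 1024, 1024, 256, 1});
    return found ? type : GEMMType::RESHAPED; // found == false means use default heuristics
}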
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h index aa21225959..6a491c5503 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.h +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -135,16 +135,16 @@ public: bool check_all() const; private: - static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/ + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/ private: // There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */ std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */ std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */ - bool _valid; /**< Overall validity */ + bool _valid; /**< Overall validity */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
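The two maps above give each tree two names: the numeric TreeID written in the DotMLGO text and the (heuristic type, IP target, data type) Index. A small sketch of the ID-based path, which is the one the parser uses while filling in nodes:

// Sketch only: register a tree, fetch it back by TreeID, then validate it.
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"

#include <tuple>
#include <utility>

using namespace arm_compute;
using namespace arm_compute::mlgo;

void register_example()
{
    MLGOHeuristics h;
    HeuristicTree  t(7, HeuristicType::GEMM_Type, "gpu", DataType::F32);
    h.add_heuristic_tree(std::move(t)); // returns false on a duplicate id or index

    bool           ok   = false;
    HeuristicTree *tree = nullptr;
    std::tie(ok, tree) = h.get_heuristic_tree(7);
    if (ok)
    {
        tree->add_leaf(0, GEMMType::NATIVE); // a single root leaf is a valid tree
        h.check_heuristic_tree(7);           // validates and caches the result in _tree_valid
    }
}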
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp index 625739e450..893daf2ed9 100644 --- a/src/runtime/CL/mlgo/MLGOParser.cpp +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -22,19 +22,21 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/MLGOParser.h" + #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/Utils.h" #include <sstream> #define CHECK(parser_expr, valid_var) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return; #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return default_val; #ifdef ARM_COMPUTE_LOGGING_ENABLED @@ -53,8 +55,7 @@ valid_var = false; \ return default_val; -#define LOG_TOKEN_POS(tokens, pos_var) \ - const auto pos_var = tokens.current_pos(); +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); #else // ARM_COMPUTE_LOGGING_ENABLED @@ -73,19 +74,12 @@ namespace { void ltrim(std::string &str) { - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) - { - return !std::isspace(ch); - })); + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); } void rtrim(std::string &str) { - str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) - { - return !std::isspace(ch); - }).base(), - str.end()); + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); } void trim(std::string &str) @@ -109,7 +103,7 @@ enum class ComparatorType }; TokenStream::TokenStream(std::istream &s, const std::string &delims) - : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{} + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} { read(); } @@ -125,7 +119,7 @@ Token TokenStream::take() ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); Token t = _tokens.front(); _tokens.pop_front(); - if(_tokens.empty()) + if (_tokens.empty()) { read(); } @@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i) ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end - while(_istream && _tokens.size() <= i) + while (_istream && _tokens.size() <= i) { read(); } @@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i) void advance(CharPosition &pos, char ch) { - if(ch == '\n') + if (ch == '\n') { pos.ln += 1; pos.col = 0; @@ -167,17 +161,16 @@ void TokenStream::read() do { // Reached eof - if(!_istream.get(ch)) + if (!_istream.get(ch)) { - if(!reached_end()) + if (!reached_end()) { _tokens.emplace_back(TokenType::End, "", _lookahead_pos); } return; } advance(_lookahead_pos, ch); - } - while(std::isspace(ch) || is_delim(ch)); + } while (std::isspace(ch) || is_delim(ch)); // Read chars until we hit a delim or eof auto orig_pos = _lookahead_pos; auto tok = recognize_tok(ch); @@ -190,41 +183,41 @@ void TokenStream::read() Token TokenStream::recognize_tok(char ch) { - if(ch == '[') + if (ch == '[') { - return Token{ TokenType::L_List, "", _lookahead_pos }; + return Token{TokenType::L_List, "", _lookahead_pos}; } - else if(ch == ']') + else if (ch == ']') { - return Token{ TokenType::R_List, "", _lookahead_pos }; + return Token{TokenType::R_List, "", _lookahead_pos}; } - else if(ch == '.') + else if (ch == '.') { - return 
float_after_dp_st(std::string{ ch }); + return float_after_dp_st(std::string{ch}); } - else if(std::isdigit(ch)) + else if (std::isdigit(ch)) { - return num_st(std::string{ ch }); + return num_st(std::string{ch}); } else { - return text_st(std::string{ ch }); + return text_st(std::string{ch}); } } Token TokenStream::num_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(ch == '.') + if (ch == '.') { return float_after_dp_st(value + ch); } - else if(!std::isdigit(ch)) + else if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value) } value += ch; } - return Token{ TokenType::Int, value, _lookahead_pos }; + return Token{TokenType::Int, value, _lookahead_pos}; } Token TokenStream::float_after_dp_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(!std::isdigit(ch)) + if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value) } value += ch; } - return Token{ TokenType::Float, value, _lookahead_pos }; + return Token{TokenType::Float, value, _lookahead_pos}; } Token TokenStream::text_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(is_delim(ch)) + if (is_delim(ch)) { break; } - if(ch == '[' || ch == ']') + if (ch == '[' || ch == ']') { rewind(_lookahead_pos); _istream.unget(); @@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value) } value += ch; } - return Token{ TokenType::Text, value, _lookahead_pos }; + return Token{TokenType::Text, value, _lookahead_pos}; } bool TokenStream::reached_end() const @@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::End) + if (tok.type != TokenType::End) { FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); } @@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); } @@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); } @@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); - if(val < 0) + if (val < 0) { FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); } @@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Float) + if (tok.type != TokenType::Float) { FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); } @@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Text || tok.value.empty()) + if (tok.type != TokenType::Text || tok.value.empty()) { FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); } @@ -361,9 +354,9 @@ 
std::string text_val(TokenStream &in, bool &valid) bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) { auto tok = in.peek(); - if(tok.type == TokenType::Text && tok.value == c_str) + if (tok.type == TokenType::Text && tok.value == c_str) { - if(take) + if (take) { in.take(); } @@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) void expect_text(TokenStream &in, const std::string &str, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_text(in, str)) + if (!accept_text(in, str)) { FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); } @@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid) bool accept_l_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::L_List) + if (tok.type == TokenType::L_List) { in.take(); return true; @@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in) void expect_l_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_l_list(in)) + if (!accept_l_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect '['"); } @@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid) bool accept_r_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::R_List) + if (tok.type == TokenType::R_List) { in.take(); return true; @@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in) void expect_r_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_r_list(in)) + if (!accept_r_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect ']'"); } @@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid) ConditionalOp conditional_op(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "<=")) + if (accept_text(in, "<=")) { return ConditionalOp::LE; } - else if(accept_text(in, ">=")) + else if (accept_text(in, ">=")) { return ConditionalOp::GE; } - else if(accept_text(in, "==")) + else if (accept_text(in, "==")) { return ConditionalOp::EQ; } - else if(accept_text(in, "<")) + else if (accept_text(in, "<")) { return ConditionalOp::LT; } - else if(accept_text(in, ">")) + else if (accept_text(in, ">")) { return ConditionalOp::GT; } @@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid) { CHECK(expect_text(in, "ip-type", valid), valid); LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gpu")) + if (accept_text(in, "gpu")) { ; } - else if(accept_text(in, "cpu")) + else if (accept_text(in, "cpu")) { ; } @@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid) DataType data_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "f16")) + if (accept_text(in, "f16")) { return DataType::F16; } - else if(accept_text(in, "f32")) + else if (accept_text(in, "f32")) { return DataType::F32; } - else if(accept_text(in, "qasymm8")) + else if (accept_text(in, "qasymm8")) { return DataType::QASYMM8; } @@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid) ComparatorType comparator_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "var")) + if (accept_text(in, "var")) { return ComparatorType::Var; } - else if(accept_text(in, "num")) + else if (accept_text(in, "num")) { return ComparatorType::Num; } - else if(accept_text(in, "enum")) + else if (accept_text(in, "enum")) { return ComparatorType::Enum; } @@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid) HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) { LOG_TOKEN_POS(in, pos); - 
if(accept_text(in, "gemm-type", take)) + if (accept_text(in, "gemm-type", take)) { return HeuristicType::GEMM_Type; } - else if(accept_text(in, "gemm-config-native", take)) + else if (accept_text(in, "gemm-config-native", take)) { return HeuristicType::GEMM_Config_Native; } - else if(accept_text(in, "gemm-config-reshaped-only-rhs", take)) + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) { return HeuristicType::GEMM_Config_Reshaped_Only_RHS; } - else if(accept_text(in, "gemm-config-reshaped", take)) + else if (accept_text(in, "gemm-config-reshaped", take)) { return HeuristicType::GEMM_Config_Reshaped; } @@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val { LOG_TOKEN_POS(in, pos); auto ht = CHECK(heuristic_type(in, valid, false), valid); - if(ht != expected_ht) + if (ht != expected_ht) { FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); } @@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val GEMMType gemm_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "native")) + if (accept_text(in, "native")) { return GEMMType::NATIVE; } - else if(accept_text(in, "reshaped-only-rhs")) + else if (accept_text(in, "reshaped-only-rhs")) { return GEMMType::RESHAPED_ONLY_RHS; } - else if(accept_text(in, "reshaped")) + else if (accept_text(in, "reshaped")) { return GEMMType::RESHAPED; } @@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigNative{ m0, n0, k0 }; + return GEMMConfigNative{m0, n0, k0}; } GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) @@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex }; + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; } GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) @@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex }; + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; } void gpu_priority(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "best-performance")) + if (accept_text(in, "best-performance")) { ; } - else if(accept_text(in, "best-memory-usage")) + else if (accept_text(in, "best-memory-usage")) { ; } @@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid) void gpu_behavior(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "static")) + if (accept_text(in, "static")) { ; } - else if(accept_text(in, "dynamic")) + else if (accept_text(in, "dynamic")) { ; } @@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid) void free_vars(TokenStream &in, bool &valid) { CHECK(expect_l_list(in, valid), 
valid); - while(!accept_r_list(in)) + while (!accept_r_list(in)) { CHECK(text_val(in, valid), valid); } @@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) { CHECK(expect_text(in, "<heuristics-table>", valid), valid); - while(!accept_text(in, "</heuristics-table>")) + while (!accept_text(in, "</heuristics-table>")) { CHECK(heuristics_table_entry(in, h, valid), valid); } @@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid) const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); - if(l_t != ComparatorType::Var || r_t != ComparatorType::Num) + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) { - FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); } - return Condition{ l_v, c_o, r_v }; + return Condition{l_v, c_o, r_v}; } void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) @@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) CHECK(expect_text(in, "<heuristic", valid), valid); const auto tree_id = CHECK(uint_val(in, valid), valid); CHECK(expect_text(in, ">", valid), valid); - HeuristicTree *t = nullptr; - std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); const HeuristicType t_heuristic_type = std::get<0>(t->index()); - while(!accept_text(in, "</heuristic>")) + while (!accept_text(in, "</heuristic>")) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "b")) + if (accept_text(in, "b")) { // Branch node const auto id = CHECK(uint_val(in, valid), valid); @@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) const auto f_id = CHECK(uint_val(in, valid), valid); valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); } - else if(accept_text(in, "l")) + else if (accept_text(in, "l")) { // Leaf node const auto id = CHECK(uint_val(in, valid), valid); @@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) // heuristic table). For now it remains as a step for validation. LOG_TOKEN_POS(in, pos); CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); - switch(t_heuristic_type) + switch (t_heuristic_type) { case HeuristicType::GEMM_Type: { @@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid) MLGOHeuristics h; CHECK_DEFAULT(header(in, valid), valid, h); CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); - while(accept_text(in, "<heuristic", false)) + while (accept_text(in, "<heuristic", false)) { CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h); } @@ -809,4 +803,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in) #undef CHECK #undef CHECK_DEFAULT #undef FAIL_WITH_MSG -#undef FAIL_WITH_MSG_DEFAULT
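The parser's entry point wraps the whole recursive-descent pass in a single call; MLGOHeuristics::reload_from_stream above is a thin wrapper around it. A hedged usage sketch (the path is hypothetical):

// Sketch only: drive parse_mlgo directly from a stream.
#include "src/runtime/CL/mlgo/MLGOParser.h"

#include <fstream>
#include <string>
#include <utility>

using namespace arm_compute::mlgo;

MLGOHeuristics load_heuristics(const std::string &path)
{
    std::ifstream fs(path, std::ios::in); // a stream that fails to open simply fails to parse
    auto parsed = parser::parse_mlgo(fs); // returns std::pair<bool, MLGOHeuristics>
    if (!parsed.first)
    {
        return MLGOHeuristics{}; // empty heuristics; all queries will report not-found
    }
    return std::move(parsed.second);
}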
\ No newline at end of file +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h index 49d8b9c644..cffce8d6a1 100644 --- a/src/runtime/CL/mlgo/MLGOParser.h +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -98,15 +98,14 @@ struct CharPosition return ln == other.ln && col == other.col; } - size_t ln{ 0 }; - size_t col{ 0 }; + size_t ln{0}; + size_t col{0}; }; /** Token */ struct Token { - Token(TokenType t, std::string v, CharPosition pos) - : type{ t }, value{ v }, pos{ pos } + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} { } @@ -196,4 +195,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in); } // namespace parser } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp index 81d418c28e..c7e0100b3c 100644 --- a/src/runtime/CL/mlgo/Utils.cpp +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -43,40 +43,38 @@ inline std::string to_str(const T &val) std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) { return os << "Native:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) { return os << "ReshapedOnlyRHS:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) { return os << "Reshaped:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "v0: " << config.v0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_lhs: " << config.interleave_lhs << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, HeuristicType ht) { - switch(ht) + switch (ht) { case HeuristicType::GEMM_Type: { @@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht) } std::ostream &operator<<(std::ostream &os, DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: { @@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos) } // namespace mlgo -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h index c634a887e9..73b537f476 100644 --- a/src/runtime/CL/mlgo/Utils.h +++ b/src/runtime/CL/mlgo/Utils.h @@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht); std::ostream &operator<<(std::ostream &os, DataType dt); std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); std::ostream &operator<<(std::ostream &os, const Query &query); -std::string to_string(const GEMMConfigNative &config); -std::string to_string(const GEMMConfigReshapedOnlyRHS &config); -std::string to_string(const GEMMConfigReshaped &config); -std::string to_string(const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); namespace parser { std::ostream &operator<<(std::ostream &os, const CharPosition &pos); @@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos); } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
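The tuner file below keeps the candidate rule of CLTuningParametersListNormal::initialize_lws_values: per dimension it explores 1, every power of two up to lws_max, and any value that divides (or, when mod_let_one is set, nearly divides) the global work size. A standalone sketch of that enumeration:

// Sketch only: mirrors the LWS candidate loop in the tuner diff below.
#include <vector>

std::vector<unsigned int> lws_candidates(unsigned int gws, unsigned int lws_max, bool mod_let_one)
{
    std::vector<unsigned int> lws{1u};
    for (unsigned int i = 2; i <= lws_max; ++i)
    {
        const bool is_power_of_two = (i & (i - 1)) == 0;
        // Exact divisors of gws, or near-divisors (remainder <= 1) when mod_let_one is set
        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
        if (mod_cond || is_power_of_two)
        {
            lws.push_back(i);
        }
    }
    return lws;
}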
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6cb2212794..5e3907f1ea 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,20 +27,20 @@ namespace arm_compute { namespace cl_tuner { -constexpr unsigned int max_lws_supported_x{ 64u }; -constexpr unsigned int max_lws_supported_y{ 32u }; -constexpr unsigned int max_lws_supported_z{ 32u }; +constexpr unsigned int max_lws_supported_x{64u}; +constexpr unsigned int max_lws_supported_y{32u}; +constexpr unsigned int max_lws_supported_z{32u}; -/** Non instantiable base class for Tuning parameters combinations that use Index2Cooard mapping */ +/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: /* Shape of 4-D search space */ - TensorShape search_space_shape{ 0, 0, 0, 0 }; - std::vector<unsigned int> _lws_x{ 0 }; - std::vector<unsigned int> _lws_y{ 0 }; - std::vector<unsigned int> _lws_z{ 0 }; - std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + TensorShape search_space_shape{0, 0, 0, 0}; + std::vector<unsigned int> _lws_x{0}; + std::vector<unsigned int> _lws_y{0}; + std::vector<unsigned int> _lws_z{0}; + std::vector<int> _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units. The value is in the range [-31,+31]. When 0, the runtime-selected wbs used is unmodified. */ @@ -116,7 +116,8 @@ private: * @param[in] lws_max Max LWS value allowed to be tested * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one. 
*/ - void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); + void + initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); }; /** A minimal subset of LWS values that only have 1,2 and 4/8 */ @@ -162,14 +163,17 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index) CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info) { - ARM_COMPUTE_UNUSED(gws); - search_space_shape[0] = max_lws_supported_x; - search_space_shape[1] = max_lws_supported_y; - search_space_shape[2] = max_lws_supported_z; + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); + + search_space_shape[0] = lws_x_max; + search_space_shape[1] = lws_y_max; + search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + _wbsm = {-3, -2, -1, 0, 1, 2, 3}; search_space_shape[3] = _wbsm.size(); } } @@ -183,34 +187,39 @@ CLTuningParams CLTuningParametersListNormal::operator[](size_t index) CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info) { - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); // Initialize the tuning parameters values to test _lws_x = {}; _lws_y = {}; _lws_z = {}; - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_x, gws[0], lws_x_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_y, gws[1], lws_y_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); search_space_shape[0] = _lws_x.size(); search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -2, -1, 0, 1, 2 }; + _wbsm = {-2, -1, 0, 1, 2}; search_space_shape[3] = _wbsm.size(); } } -void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, + unsigned int gws, + unsigned int lws_max, + bool mod_let_one) { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; ++i) + for (unsigned int i = 2; i <= lws_max; ++i) { // Power of two condition const bool is_power_of_two = (i & (i - 1)) == 0; @@ -218,7 +227,7 @@ void 
CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in // Modulo condition, applied according to the mod_let_one flag const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0; - if(mod_cond || is_power_of_two) + if (mod_cond || is_power_of_two) { lws.push_back(i); } @@ -227,9 +236,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, { - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8 - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4 - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4 + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8 + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4 + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4 // Initialize the LWS values to test _lws_x = {}; @@ -243,9 +252,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -1, 0, 1 }; + _wbsm = {-1, 0, 1}; search_space_shape[3] = _wbsm.size(); } } @@ -254,7 +263,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; i *= 4) + for (unsigned int i = 2; i <= lws_max; i *= 4) { lws.push_back(i); } @@ -262,7 +271,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(tuning_info.tuner_mode) + switch (tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info); diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index f112d456c7..9fbdc3a4dd 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include <atomic> @@ -53,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. @@ -89,8 +89,7 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } /** Set thread affinity. 
Pin current thread to a particular core @@ -99,17 +98,17 @@ */ void set_thread_affinity(int core_id) { - if(core_id < 0) + if (core_id < 0) { return; } -#if !defined(__APPLE__) +#if !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__) cpu_set_t set; CPU_ZERO(&set); CPU_SET(core_id, &set); ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set), "Error setting thread affinity"); -#endif /* !defined(__APPLE__) */ +#endif /* !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__) */ } /** There are currently 2 scheduling modes supported by CPPScheduler @@ -150,10 +149,10 @@ public: */ explicit Thread(int core_pin = -1); - Thread(const Thread &) = delete; + Thread(const Thread &) = delete; Thread &operator=(const Thread &) = delete; Thread(Thread &&) = delete; - Thread &operator=(Thread &&) = delete; + Thread &operator=(Thread &&) = delete; /** Destructor. Make the thread join. */ ~Thread(); @@ -172,7 +171,7 @@ public: void start(); /** Wait for the current kernel execution to complete. */ - void wait(); + std::exception_ptr wait(); /** Function ran by the worker thread. */ void worker_thread(); @@ -196,21 +195,20 @@ public: private: std::thread _thread{}; ThreadInfo _info{}; - std::vector<IScheduler::Workload> *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; - int _core_pin{ -1 }; - std::list<Thread> *_thread_pool{ nullptr }; - unsigned int _wake_beg{ 0 }; - unsigned int _wake_end{ 0 }; + std::vector<IScheduler::Workload> *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list<Thread> *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -Thread::Thread(int core_pin) - : _core_pin(core_pin) +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } @@ -218,7 +216,7 @@ Thread::Thread(int core_pin) Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; set_workload(nullptr, feeder, ThreadInfo()); @@ -244,24 +242,20 @@ void Thread::start() _cv.notify_one(); } -void Thread::wait() +std::exception_ptr Thread::wait() { { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _job_complete; }); } - - if(_current_exception) - { - std::rethrow_exception(_current_exception); - } + return _current_exception; } void Thread::worker_thread() { set_thread_affinity(_core_pin); - while(true) + while (true) { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -270,18 +264,18 @@ void Thread::worker_thread() _current_exception = nullptr; // Exit if the worker thread has not been fed with workloads - if(_workloads == nullptr || _feeder == nullptr) + if (_workloads == nullptr || _feeder == nullptr) { return; } // Wake up more peer threads from thread pool if this job has been delegated to the current thread - if(_thread_pool != nullptr) + if (_thread_pool != nullptr) { auto thread_it = _thread_pool->begin(); std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg)); auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1)); - for(unsigned int t = 
_wake_beg; t < wake_end; ++t, ++thread_it) + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) { thread_it->start(); } @@ -295,7 +289,7 @@ void Thread::worker_thread() #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(...) + catch (...) { _current_exception = std::current_exception(); } @@ -326,11 +320,11 @@ struct CPPScheduler::Impl final : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U) { const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE")); - if(mode_env_v == "linear") + if (mode_env_v == "linear") { _forced_mode = ModeToggle::Linear; } - else if(mode_env_v == "fanout") + else if (mode_env_v == "fanout") { _forced_mode = ModeToggle::Fanout; } @@ -354,7 +348,7 @@ struct CPPScheduler::Impl final // Set affinity on worker threads _threads.clear(); - for(auto i = 1U; i < _num_threads; ++i) + for (auto i = 1U; i < _num_threads; ++i) { _threads.emplace_back(func(i, thread_hint)); } @@ -363,20 +357,23 @@ struct CPPScheduler::Impl final void auto_switch_mode(unsigned int num_threads_to_use) { // If the environment variable is set to any of the modes, it overwrites the mode selected over num_threads_to_use - if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) + if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) { set_fanout_mode(m_default_wake_fanout, num_threads_to_use); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", + this->wake_fanout(), num_threads_to_use); } else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8)) { set_linear_mode(); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", + num_threads_to_use); } } void set_linear_mode() { - for(auto &thread : _threads) + for (auto &thread : _threads) { thread.set_linear_mode(); } @@ -388,14 +385,14 @@ struct CPPScheduler::Impl final ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1); const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1)); auto thread_it = _threads.begin(); - for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) + for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) { const auto wake_begin = i * actual_wake_fanout - 1; const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1); thread_it->set_fanout_mode(&_threads, wake_begin, wake_end); } // Reset the remaining threads' wake up schedule - while(thread_it != _threads.end()) + while (thread_it != _threads.end()) { thread_it->set_fanout_mode(&_threads, 0U, 0U); ++thread_it; @@ -421,9 +418,9 @@ struct CPPScheduler::Impl final unsigned int _num_threads; std::list<Thread> _threads; arm_compute::Mutex _run_workloads_mutex{}; - Mode _mode{ Mode::Linear }; - ModeToggle _forced_mode{ ModeToggle::None }; - unsigned int _wake_fanout{ 0 }; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; }; /* @@ -435,8 +432,7 @@ CPPScheduler &CPPScheduler::get() 
return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(std::make_unique<Impl>(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint())) { } @@ -469,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads) // This is not great because different threads' workloads won't run in parallel, but at least they // won't interfere with each other and deadlock. arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex); - const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); - if(num_threads_to_use < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); + if (num_threads_to_use < 1) { return; } // Re-adjust the mode if the actual number of threads to use is different from the number of threads created _impl->auto_switch_mode(num_threads_to_use); int num_threads_to_start = 0; - switch(_impl->mode()) + switch (_impl->mode()) { case CPPScheduler::Impl::Mode::Fanout: { @@ -493,35 +489,54 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads) } ThreadFeeder feeder(num_threads_to_use, workloads.size()); ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); info.num_threads = num_threads_to_use; unsigned int t = 0; auto thread_it = _impl->_threads.begin(); // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread - for(; t < num_threads_to_use - 1; ++t, ++thread_it) + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; thread_it->set_workload(&workloads, feeder, info); } thread_it = _impl->_threads.begin(); - for(int i = 0; i < num_threads_to_start; ++i, ++thread_it) + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) { thread_it->start(); } - info.thread_id = t; // Set main thread's thread_id - process_workloads(workloads, feeder, info); // Main thread processes workloads + info.thread_id = t; // Set main thread's thread_id + std::exception_ptr last_exception = nullptr; +#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED + try + { +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED + } + catch (...) 
+ { + last_exception = std::current_exception(); + } + try { #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ thread_it = _impl->_threads.begin(); - for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) + { + std::exception_ptr current_exception = thread_it->wait(); + if (current_exception) + { + last_exception = current_exception; + } + } + if (last_exception) { - thread_it->wait(); + std::rethrow_exception(last_exception); } #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (const std::system_error &e) { std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; } diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 70536b7ccc..c46a2731d8 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -39,33 +39,36 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { const Window &max_window = kernel->window(); - if(hints.split_dimension() != IScheduler::split_dimensions_all) + if (hints.split_dimension() != IScheduler::split_dimensions_all) { const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + if (num_iterations < 1) { return; } } ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); kernel->run(kernel->window(), info); } -void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack &tensors) { ARM_COMPUTE_UNUSED(hints); ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); kernel->run_op(tensors, window, info); } void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads) { ThreadInfo info; - info.cpu_info = &_cpu_info; - for(auto &wl : workloads) + info.cpu_info = &cpu_info(); + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index b6803d0d37..94a1673d59 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -40,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -78,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint8_t *>(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<int8_t *>(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint16_t *>(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + 
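
dequantize_tensor() and quantize_tensor() here walk every element through execute_window_loop and apply the usual affine mapping between quantized codes and floats. A scalar sketch of the arithmetic behind the dequantize()/quantize_qasymm8() calls (illustrative helpers, with the clamping hard-coded for 8-bit unsigned data):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // f = scale * (q - offset): integer code -> real value
    inline float dequantize_u8(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset);
    }

    // q = round(f / scale) + offset, clamped to the representable range
    inline uint8_t quantize_u8(float f, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(f / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

The QASYMM8_SIGNED and QASYMM16 cases in the switch differ only in the integer type and the clamping range.
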
input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -130,12 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -146,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -156,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -164,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? 
&_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -233,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -245,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp 
b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index fdb4c9f0f6..e6291f973e 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -26,6 +26,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include <list>
@@ -34,25 +36,35 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc,
+                          const ITensorInfo *input_conf,
+                          const ITensorInfo *input_priorbox,
+                          const ITensorInfo *output,
+                          DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+                                    "The priorbox input tensor should be [C3, 2, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
     const int num_priors = input_priorbox->tensor_shape()[0] / 4;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+                                        input_loc->tensor_shape()[0],
+                                    "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+                                        input_conf->tensor_shape()[0],
+                                    "Number of priors must match number of confidence predictions.");
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        const unsigned int max_size =
+            info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
     }
@@ -63,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input
 /** Function used to sort pair<float, T> in descending order based on the score (first) value.
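
SortScorePairDescend is the comparator used further down by ApplyNMSFast, which sorts its (score, index) candidates by descending score and then truncates to top_k before suppression starts. The same selection step as a free-standing sketch (keep_top_k is an illustrative name, not library API):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Order candidates by descending score and keep at most top_k of them
    // (top_k < 0 means "keep everything", as in ApplyNMSFast).
    inline void keep_top_k(std::vector<std::pair<float, int>> &candidates, int top_k)
    {
        std::stable_sort(candidates.begin(), candidates.end(),
                         [](const std::pair<float, int> &a, const std::pair<float, int> &b)
                         { return a.first > b.first; });
        if (top_k > -1 && top_k < static_cast<int>(candidates.size()))
        {
            candidates.resize(top_k);
        }
    }
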
*/ template <typename T> -bool SortScorePairDescend(const std::pair<float, T> &pair1, - const std::pair<float, T> &pair2) +bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2) { return pair1.first > pair2.first; } @@ -80,16 +91,19 @@ bool SortScorePairDescend(const std::pair<float, T> &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector<LabelBBox> &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector<LabelBBox> &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -100,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -128,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
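
The base_ptr arithmetic in retrieve_all_loc_predictions() above linearizes a four-level layout in which the 4 box coordinates vary fastest, then class, then prior, then batch item. Written out as an index helper (a sketch of the same arithmetic, not a library function):

    // Flat offset of coordinate k (0..3) for batch item i, prior p, class c, matching:
    // i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4 + k
    inline int loc_offset(int i, int p, int c, int k, int num_priors, int num_loc_classes)
    {
        return ((i * num_priors + p) * num_loc_classes + c) * 4 + k;
    }

With this, all_location_predictions[i][label][p] in the loop above is just the [xmin, ymin, xmax, ymax] slice starting at loc_offset(i, p, c, 0).
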
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector<std::map<int, std::vector<float>>> &all_confidence_scores) { std::vector<float> tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -166,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector<BBox> &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector<BBox> &all_prior_bboxes, std::vector<std::array<float, 4>> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array<float, 4> var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array<float, 4> var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -206,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. 
* */ -void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array<float, 4> &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. - switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -235,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -256,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? 
bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -267,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -287,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian * @param[out] indices The kept indices of bboxes after nms. * */ -void ApplyNMSFast(const std::vector<BBox> &bboxes, - const std::vector<float> &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector<int> &indices) +void ApplyNMSFast(const std::vector<BBox> &bboxes, + const std::vector<float> &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector<int> &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -298,9 +328,9 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, std::list<std::pair<float, int>> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -311,7 +341,7 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -320,46 +350,45 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. 
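
The block that follows this comment computes exactly the Jaccard index named above: intersection area divided by the union of the two corner-encoded boxes. Condensed into a pair of helpers (a sketch over the same [xmin, ymin, xmax, ymax] convention, not the library's API):

    #include <algorithm>
    #include <array>

    using Box = std::array<float, 4>; // {xmin, ymin, xmax, ymax}

    inline float area(const Box &b)
    {
        return std::max(0.f, b[2] - b[0]) * std::max(0.f, b[3] - b[1]);
    }

    inline float iou(const Box &a, const Box &b)
    {
        const float ix = std::min(a[2], b[2]) - std::max(a[0], b[0]); // intersection width
        const float iy = std::min(a[3], b[3]) - std::max(a[1], b[1]); // intersection height
        if (ix <= 0.f || iy <= 0.f)
        {
            return 0.f; // disjoint boxes
        }
        const float inter = ix * iy;
        return inter / (area(a) + area(b) - inter);
    }

ApplyNMSFast keeps a candidate only while its overlap with every already-kept box stays at or below adaptive_threshold; when eta < 1 and the threshold is still above 0.5, it decays (adaptive_threshold *= eta) after each kept box, progressively tightening the suppression.
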
- BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array<float, 4>({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array<float, 4>( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 
0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -369,12 +398,12 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -383,23 +412,42 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); + ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); + // Output auto initialization if not yet initialized // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -415,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. 
continue; @@ -435,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -444,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -454,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; std::map<int, std::vector<int>> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(c)->second; - const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; + const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector<int> &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -536,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map<int, std::vector<int>> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -557,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector<float> &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector<BBox> &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector<int> &indices = it.second; + const std::vector<int> &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index 31f1fafd69..2861d6cacb 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include <cstddef> @@ -36,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize]."); - if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); 
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+                                                           TensorShape(num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
     }
-    if(num_detection->total_size() != 0)
+    if (num_detection->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -91,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn
     return Status{};
 }
-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
 {
     const float half_factor = 0.5f;
     // BBox is equivalent to CenterSizeEncoding [y,x,h,w]
     const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
     const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
-    const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
-    const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+    const float half_h =
+        half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+    const float half_w =
+        half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
     // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
     auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
@@ -116,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode
 * @param[in] info The detection information
 * @param[out] decoded_boxes The decoded bboxes.
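
DecodeBoxCorner() above is the SSD-style center-size decoding: each predicted (y, x, h, w) offset is unscaled, applied to its anchor's centre and extents, and written back as corner coordinates. The same arithmetic as a standalone sketch (Vec4 and decode_center_size are illustrative names, not the library's):

    #include <array>
    #include <cmath>

    using Vec4 = std::array<float, 4>;

    // delta and anchor are in [y_center, x_center, h, w] order; returns [xmin, ymin, xmax, ymax].
    inline Vec4 decode_center_size(const Vec4 &delta, const Vec4 &anchor,
                                   float sy, float sx, float sh, float sw)
    {
        const float yc     = delta[0] / sy * anchor[2] + anchor[0];
        const float xc     = delta[1] / sx * anchor[3] + anchor[1];
        const float half_h = 0.5f * std::exp(delta[2] / sh) * anchor[2];
        const float half_w = 0.5f * std::exp(delta[3] / sw) * anchor[3];
        return {xc - half_w, yc - half_h, xc + half_w, yc + half_h};
    }

The exp() on the size terms keeps decoded widths and heights positive regardless of the raw network output.
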
*/ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -131,103 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_signed_t 
*>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms, - std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector<int> &result_idx_boxes_after_nms, + const std::vector<float> &result_scores_after_nms, + const std::vector<int> &result_classes_after_nms, + std::vector<unsigned int> &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]); - 
*(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast<float>(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + 
DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); + _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -239,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? 
info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores; @@ -254,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -263,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, 
info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -287,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector<int> result_idx_boxes_after_nms; std::vector<int> result_classes_after_nms; std::vector<float> result_scores_after_nms; std::vector<unsigned int> sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == 
-1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -354,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); std::vector<float> max_scores; std::vector<int> box_indices; std::vector<int> max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector<float> box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector<unsigned int> max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -406,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima 
Suppression _nms.run(); std::vector<unsigned int> selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -419,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index d0d0b1e98b..3217742c6b 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,21 +25,32 @@ #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h" +#include "src/common/utils/Log.h" + namespace arm_compute { -void CPPNonMaximumSuppression::configure( - const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +void CPPNonMaximumSuppression::configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { + ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + auto k = std::make_unique<CPPNonMaximumSuppressionKernel>(); k->configure(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); _kernel = std::move(k); } -Status CPPNonMaximumSuppression::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, + nms_threshold); } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp index 76fa09f12b..83941f1dc1 100644 --- a/src/runtime/CPP/functions/CPPPermute.cpp +++ b/src/runtime/CPP/functions/CPPPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,10 +25,14 @@ #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h" +#include "src/common/utils/Log.h" + using namespace arm_compute; void CPPPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm) { + ARM_COMPUTE_LOG_PARAMS(input, output, perm); + auto k = std::make_unique<CPPPermuteKernel>(); k->configure(input, output, perm); _kernel = std::move(k); diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp index 2547e56a1d..3d64def804 100644 --- a/src/runtime/CPP/functions/CPPTopKV.cpp +++ b/src/runtime/CPP/functions/CPPTopKV.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,16 +25,23 @@ #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h" +#include "src/common/utils/Log.h" + namespace arm_compute { void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) { + ARM_COMPUTE_LOG_PARAMS(predictions, targets, output, k); + auto kernel = std::make_unique<CPPTopKVKernel>(); kernel->configure(predictions, targets, output, k); _kernel = std::move(kernel); } -Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { return CPPTopKVKernel::validate(predictions, targets, output, k); } diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp index 3b4ba2ba42..8f72473aeb 100644 --- a/src/runtime/CPP/functions/CPPUpsample.cpp +++ b/src/runtime/CPP/functions/CPPUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,14 @@ #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" +#include "src/common/utils/Log.h" + using namespace arm_compute; void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) { + ARM_COMPUTE_LOG_PARAMS(input, output, info); + auto k = std::make_unique<CPPUpsampleKernel>(); k->configure(input, output, info); _kernel = std::move(k); diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index df04fed401..ecf84abd2c 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,14 +25,15 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/runtime/SchedulerUtils.h" namespace arm_compute { IScheduler::IScheduler() - : _cpu_info() { // Work out the best possible number of execution threads _num_threads_hint = cpuinfo::num_threads_hint(); @@ -40,7 +41,7 @@ IScheduler::IScheduler() CPUInfo &IScheduler::cpu_info() { - return _cpu_info; + return CPUInfo::get(); } void IScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) @@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); #ifndef BARE_METAL const Window &max_window = window; - if(hints.split_dimension() == IScheduler::split_dimensions_all) + if (hints.split_dimension() == IScheduler::split_dimensions_all) { /* * if the split dim is size_t max then this signals we should parallelise over @@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); std::vector<IScheduler::Workload> workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) + for (unsigned int ni = 0; ni != n_threads; ++ni) { - for(unsigned int mi = 0; mi != m_threads; ++mi) + for (unsigned int mi = 0; mi != m_threads; ++mi) { workloads.push_back( - [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); - win.validate(); + win.validate(); - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - thread_locator.validate(); + thread_locator.validate(); - kernel->run_nd(win, info, thread_locator); - }); + kernel->run_nd(win, info, thread_locator); + }); } } run_workloads(workloads); @@ -103,16 +104,16 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, this->num_threads()); - if(num_iterations == 0) + if (num_iterations == 0) { return; } - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; - info.cpu_info = &_cpu_info; - if(tensors.empty()) + info.cpu_info = &cpu_info(); + if (tensors.empty()) { kernel->run(max_window, info); } @@ -124,14 +125,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W else { unsigned int num_windows = 0; - switch(hints.strategy()) + switch (hints.strategy()) { case StrategyHint::STATIC: num_windows = num_threads; break; case StrategyHint::DYNAMIC: { - const unsigned int granule_threshold 
= (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+                const unsigned int granule_threshold =
+                    (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
                 // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
                 num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
                 break;
@@ -139,16 +141,19 @@
             default:
                 ARM_COMPUTE_ERROR("Unknown strategy");
         }
+        // Make sure the smallest window is larger than the minimum workload size
+        num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
+
         std::vector<IScheduler::Workload> workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; ++t)
+        for (unsigned int t = 0; t < num_windows; ++t)
         {
             //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
             {
                 Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                 win.validate();
-                if(tensors.empty())
+                if (tensors.empty())
                 {
                     kernel->run(win, info);
                 }
@@ -172,4 +177,44 @@ void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const ch
     run_workloads(workloads);
 }
 
+std::size_t IScheduler::adjust_num_of_windows(const Window     &window,
+                                              std::size_t       split_dimension,
+                                              std::size_t       init_num_windows,
+                                              const ICPPKernel &kernel,
+                                              const CPUInfo    &cpu_info)
+{
+    // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
+    if (window.num_iterations(split_dimension) < init_num_windows)
+    {
+        auto recommended_split_dim = Window::DimX;
+        for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+        {
+            if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+            {
+                recommended_split_dim = dims;
+            }
+        }
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+            "Dimension %zu is not a suitable dimension to split the workload. Recommended dimension: %zu",
+            split_dimension, recommended_split_dim);
+    }
+
+    for (auto t = init_num_windows; t > 0; --t) // Try the highest number of windows, init_num_windows, first
+    {
+        // Try splitting the workload into t, subject to each sub-workload size >= mws.
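+        // Illustrative arithmetic, not part of the original patch: with 512 iterations
+        // on the split dimension and kernel.get_mws() returning 128, 512 / 128 = 4, so
+        // the check below passes for t <= 4: at most 4 windows, each getting at least
+        // the minimum workload size.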
+ if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) + { + if (t != init_num_windows) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using a different thread count than the one assigned by the user."); + } + return t; + } + } + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using single thread instead of the thread count assigned by the user."); + return 1; // If the workload is so small that it can't be split, we should run a single thread +} + } // namespace arm_compute diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp index a6bc950644..8e5b62ae7d 100644 --- a/src/runtime/ISimpleLifetimeManager.cpp +++ b/src/runtime/ISimpleLifetimeManager.cpp @@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager() void ISimpleLifetimeManager::register_group(IMemoryGroup *group) { - if(_active_group == nullptr) + if (_active_group == nullptr) { ARM_COMPUTE_ERROR_ON(group == nullptr); _active_group = group; @@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group) bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) { - if(group == nullptr) + if (group == nullptr) { return false; } const bool status = bool(_finalized_groups.erase(group)); - if(status) + if (status) { group->mappings().clear(); } @@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) void ISimpleLifetimeManager::start_lifetime(void *obj) { ARM_COMPUTE_ERROR_ON(obj == nullptr); - ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!"); + ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), + "Memory object is already registered!"); // Check if there is a free blob - if(_free_blobs.empty()) + if (_free_blobs.empty()) { - _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } }); + _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}}); } else { @@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e) - { - return !e.second.status; - }); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair<void *, Element> &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index 081cd990f3..96287dcc49 100644 --- a/src/runtime/IWeightsManager.cpp +++ 
b/src/runtime/IWeightsManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,23 +25,27 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; + _managed_counter[weights]; + } + else + { + _managed_counter[weights].counter++; } // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -54,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -68,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -76,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -87,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -118,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. 
If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -134,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -146,4 +150,28 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei return transformed_weights; } + +void IWeightsManager::release(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].counter--; + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + { + weights->mark_as_unused(); + } +} + +void IWeightsManager::pre_mark_as_unused(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].is_unused = true; +} } // namespace arm_compute diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index ac0a32539e..90fd025eb7 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index 2e418ae9e3..5fa9ea47e9 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager) +MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, + std::shared_ptr<IPoolManager> pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp index a5fc0a2726..fcfd3251ff 100644 --- a/src/runtime/NEON/INEOperator.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/INEOperator.h" + #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -32,14 +34,13 @@ namespace experimental { INEOperator::~INEOperator() = default; -INEOperator::INEOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void INEOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 5438bce62a..b6977221b9 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute @@ -33,8 +34,7 @@ namespace arm_compute INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 21dd58e378..04bff9fa4b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" #include "src/runtime/Utils.h" @@ -32,9 +33,7 @@ namespace arm_compute { INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index 2b5c51fa5a..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,24 +24,24 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuActivation.h" + +#include "src/cpu/operators/CpuActivation.h" namespace arm_compute { struct NEActivationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - IRuntimeContext *ctx{ nullptr }; - std::unique_ptr<cpu::CpuActivation> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr<cpu::CpuActivation> op{nullptr}; }; -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) - : _impl(std::make_unique<Impl>()) +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { _impl->ctx = ctx; } -NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; NEActivationLayer::~NEActivationLayer() = default; @@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); } -Status NEActivationLayer::validate(const ITensorInfo 
*input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return cpu::CpuActivation::validate(input, output, act_info); } diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp new file mode 100644 index 0000000000..a72364791c --- /dev/null +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" + +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuAddMulAdd.h" + +namespace arm_compute +{ +struct NEAddMulAdd::Impl +{ + std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; +}; + +NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +NEAddMulAdd::~NEAddMulAdd() = default; + +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + + _impl->op = std::make_unique<cpu::CpuAddMulAdd>(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? 
add_output->info() : nullptr, final_output->info(), policy, act_info); + + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, + }; + + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); +} + +void NEAddMulAdd::run() +{ + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 7bca20d46c..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,31 +29,68 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NECast.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { +struct NEArgMinMaxLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<NEReductionOperation> reduction_function{}; + std::unique_ptr<NECast> cast_function{}; + std::unique_ptr<Tensor> tmp_reduction_result{}; +}; + NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _reduction_function(std::make_unique<NEReductionOperation>()) +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_UNUSED(memory_manager); + _impl->memory_manager = std::move(memory_manager); } + void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) { - _reduction_function->configure(input, output, axis, op, false); + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); + _impl->reduction_function = std::make_unique<NEReductionOperation>(); + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->cast_function = std::make_unique<NECast>(); + _impl->tmp_reduction_result = std::make_unique<Tensor>(); + _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false); + _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE); + _impl->memory_group.manage(_impl->tmp_reduction_result.get()); + _impl->tmp_reduction_result->allocator()->allocate(); + } + else + { + _impl->reduction_function->configure(input, output, axis, op, false); + } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo 
*output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } void NEArgMinMaxLayer::run() { - _reduction_function->run(); + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->reduction_function->run(); + if (_impl->tmp_reduction_result != nullptr) + { + _impl->cast_function->run(); + } } -} // namespace arm_compute
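The NEArgMinMaxLayer rework above splits 64-bit index output into two stages: the reduction runs into a memory-managed 32-bit temporary, and NECast then converts the result to the requested type. A minimal usage sketch of that path follows; the tensor shapes, the axis choice, and the exact include set are illustrative assumptions, not taken from the patch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    // An S64 output selects the new path: NEReductionOperation writes a managed
    // 32-bit temporary, then NECast converts it to the requested 64-bit type.
    output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S64));

    NEArgMinMaxLayer argminmax;
    argminmax.configure(&input, 0 /* axis */, &output, ReductionOperation::ARG_IDX_MAX);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input ...
    argminmax.run(); // run() scopes the memory group, so the temporary is acquired here
    return 0;
}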
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 2e4755b949..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuAdd.h" + +#include "src/cpu/operators/CpuAdd.h" #include <utility> @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticAddition::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuAdd> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuAdd> op{nullptr}; }; -NEArithmeticAddition::NEArithmeticAddition() - : _impl(std::make_unique<Impl>()) +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>()) { } -NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; NEArithmeticAddition::~NEArithmeticAddition() = default; -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); } -void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 0263d4cbb6..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" -#include "src/runtime/cpu/operators/CpuSub.h" + +#include "src/cpu/operators/CpuSub.h" #include <utility> @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuSub> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSub> op{nullptr}; }; -NEArithmeticSubtraction::NEArithmeticSubtraction() - : _impl(std::make_unique<Impl>()) +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>()) { } -NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; 
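NEArithmeticAddition and NEArithmeticSubtraction above follow the same operator-backend shape: configure() wires ITensorInfo objects into a cpu:: operator, and run() hands the live tensors over through an ITensorPack, exactly as NECast::run() does earlier in this patch. A condensed sketch of the run() side of that pattern, assuming the Impl members shown in the diff (not the verbatim source):

void NEArithmeticAddition::run()
{
    // Bundle the raw tensors under the slot IDs the operator expects, then dispatch
    ITensorPack pack = {{TensorType::ACL_SRC_0, _impl->src_0},
                        {TensorType::ACL_SRC_1, _impl->src_1},
                        {TensorType::ACL_DST, _impl->dst}};
    _impl->op->run(pack);
}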
NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } -void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index b90a38b47f..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,29 +29,44 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" namespace arm_compute { NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); // Configure kernel _norm_kernel = std::make_unique<NEBatchNormalizationLayerKernel>(); _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index 8f537a650a..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2019-2020 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,31 +28,40 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" namespace arm_compute { void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); k->configure(input, block_shape, output); _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); - k->configure(input, block_shape_x, block_shape_y, output); + k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { - return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 81c087988a..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include <utility> @@ -31,6 +32,7 @@ using namespace arm_compute; void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<NEBitwiseAndKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 3155df5db3..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include <utility> @@ -31,6 +32,7 @@ using namespace arm_compute; void NEBitwiseNot::configure(const ITensor *input, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, output); auto k = std::make_unique<NEBitwiseNotKernel>(); k->configure(input, output); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 793eb25d80..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include <utility> @@ -31,6 +32,7 @@ using namespace arm_compute; void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<NEBitwiseOrKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index 2d0af63e35..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include <utility> @@ -31,6 +32,7 @@ using namespace arm_compute; void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<NEBitwiseXorKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index cfd14faca0..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,19 +22,28 @@ * SOFTWARE. 
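A pattern repeated across NEBitwiseAnd/Not/Or/Xor (and most functions in this patch): configure() now begins with ARM_COMPUTE_LOG_PARAMS(...), pulled in via the new src/common/utils/Log.h include. The snippet below is a conceptual stand-in for what such a variadic parameter logger looks like; it is not the library's implementation, which as I understand it records the argument values and compiles the macro away unless logging is enabled at build time:

#include <iostream>
#include <sstream>

// Conceptual sketch only: log the calling function and its argument count.
template <typename... Args>
void log_params(const char *func, const Args &... /*args*/)
{
    std::ostringstream oss;
    oss << func << " configured with " << sizeof...(Args) << " parameter(s)";
    std::cout << oss.str() << '\n';
}

#define LOG_PARAMS(...) log_params(__func__, __VA_ARGS__)

void configure(int input, int output) // stand-in for an ITensor-based configure()
{
    LOG_PARAMS(input, output); // first statement, before any kernel is created
}

int main()
{
    configure(1, 2);
    return 0;
}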
*/ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel auto k = std::make_unique<NEBoundingBoxTransformKernel>(); k->configure(boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index b519576ad5..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,22 +24,23 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuCast.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuCast.h" namespace arm_compute { struct NECast::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuCast> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; }; -NECast::NECast() - : _impl(std::make_unique<Impl>()) +NECast::NECast() : _impl(std::make_unique<Impl>()) { } -NECast::NECast(NECast &&) = default; +NECast::NECast(NECast &&) = default; NECast &NECast::operator=(NECast &&) = default; NECast::~NECast() = default; @@ -49,19 +50,19 @@ void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy) _impl->dst = output; ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); - + ARM_COMPUTE_LOG_PARAMS(input, output, policy); _impl->op = std::make_unique<cpu::CpuCast>(); _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy) +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy) { return cpu::CpuCast::validate(input, output, policy); } void NECast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index bf4af83a0d..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,12 +24,15 @@ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" namespace arm_compute { void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups) { + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique<NEChannelShuffleLayerKernel>(); k->configure(input, output, num_groups); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index dcc5cd3a64..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,33 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "src/runtime/cpu/operators/CpuConcatenate.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { struct NEConcatenateLayer::Impl { std::vector<const ITensor *> srcs{}; - ITensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr<cpu::CpuConcatenate> op{ nullptr }; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr<cpu::CpuConcatenate> op{nullptr}; }; -NEConcatenateLayer::NEConcatenateLayer() - : _impl(std::make_unique<Impl>()) +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>()) { } -NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; NEConcatenateLayer::~NEConcatenateLayer() = default; @@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I _impl->op = std::make_unique<cpu::CpuConcatenate>(); std::vector<const ITensorInfo *> inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } @@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu void NEConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git 
a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp
new file mode 100644
index 0000000000..8f41151d6c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConv3D.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDirectConv3d.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::experimental;
+
+struct NEConv3D::Impl
+{
+    std::unique_ptr<cpu::ICpuOperator> op{nullptr};
+    ITensorPack                        run_pack{};
+};
+
+NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>())
+{
+}
+
+NEConv3D::~NEConv3D() = default;
+
+void NEConv3D::configure(
+    ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info);
+
+    auto f = std::make_unique<cpu::CpuDirectConv3d>();
+    f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(),
+                 conv_info);
+    _impl->op = std::move(f);
+
+    if (_impl->op != nullptr)
+    {
+        _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+    }
+}
+
+Status NEConv3D::validate(const ITensorInfo *input,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *output,
+                          const Conv3dInfo  &conv_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info));
+
+    return Status{};
+}
+
+void NEConv3D::run()
+{
+    if (_impl->op != nullptr)
+    {
+        _impl->op->run(_impl->run_pack);
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f2253d8be4..84e8565aaf 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -23,24 +23,27 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
 
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
 
 namespace arm_compute
 {
 struct NEConvertFullyConnectedWeights::Impl
 {
-    const ITensor *src{ nullptr };
-    ITensor       *dst{ nullptr };
-    std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{ nullptr };
+    const ITensor                                         *src{nullptr};
+    ITensor                                               *dst{nullptr};
+    std::unique_ptr<cpu::CpuConvertFullyConnectedWeights>  op{nullptr};
 };
-NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
-    : _impl(std::make_unique<Impl>())
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
 {
 }
 NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
-void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void NEConvertFullyConnectedWeights::configure(const ITensor     *input,
+                                               ITensor           *output,
+                                               const TensorShape &original_input_shape,
+                                               DataLayout         data_layout)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -50,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou
     _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
 }
-Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
-                                                DataLayout data_layout)
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+                                                const ITensorInfo *output,
+                                                const TensorShape &original_input_shape,
+                                                DataLayout         data_layout)
 {
     return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
 }
@@ -63,4 +68,4 @@ void NEConvertFullyConnectedWeights::run()
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index e43d976944..8efebbbb1a 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,235 +25,184 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" -#include <cmath> -#include <tuple> -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuConv2d.h" +#include "src/cpu/operators/CpuDirectConv2d.h" +#include "src/cpu/operators/CpuGemmConv2d.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT - : _memory_manager(std::move(memory_manager)), - _function() +using namespace arm_compute::experimental; + +struct NEConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<cpu::ICpuOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +NEConvolutionLayer::~NEConvolutionLayer() = default; + +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - { - auto f = std::make_unique<NEWinogradConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); - _function = std::move(f); - break; - } case ConvolutionMethod::GEMM: - { - auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); - _function = std::move(f); - break; - } case ConvolutionMethod::GEMM_CONV2D: - { - auto f = std::make_unique<NEGEMMConv2d>(_memory_manager); - f->configure(input, weights, biases, output, info); - _function = std::move(f); - break; - } case ConvolutionMethod::DIRECT: { - auto f = std::make_unique<NEDirectConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + auto f = std::make_unique<cpu::CpuConv2d>(); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op = std::move(f); break; } case ConvolutionMethod::FFT: { - auto f = std::make_unique<NEFFTConvolutionLayer>(_memory_manager); + auto f = std::make_unique<NEFFTConvolutionLayer>(_impl->memory_manager); f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + _impl->func = std::move(f); break; } default: ARM_COMPUTE_ERROR("Not supported."); break; } + + if (_impl->op) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); - const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(NEConvolutionLayer::get_convolution_method(input, weights, 
output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); + + // Biases with dynamic values are not supported with quantized inputs. + if (biases) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())), + "Dynamic Biases are not supported with quantized input data."); + } + + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); - break; case ConvolutionMethod::GEMM: - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info)); - break; case ConvolutionMethod::GEMM_CONV2D: - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info)); - break; case ConvolutionMethod::DIRECT: - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); break; } - return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); - ARM_COMPUTE_UNUSED(weights_info); - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); +} - /* Input spatial dims, kernel size, IFM/OFM, conv info*/ - using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; - using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; +void NEConvolutionLayer::run() +{ + prepare(); - const std::vector<ConfigurationMethod> known_configs = - { - // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), - // VGG16 / VGG19 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; + MemoryGroupResourceScope scope_mg(_impl->memory_group); - const auto find_config = [&](ConfigurationMethod c) + if (_impl->func) { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); - - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); - }; - - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + _impl->func->run(); + } + else { - return (*found).second; + _impl->op->run(_impl->run_pack); } +} - if(dilation != Size2D(1U, 1U)) +void NEConvolutionLayer::prepare() +{ + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->prepare(); } else { - // SRGAN - // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (NEDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // This heuristics only applies to F16 data type on A55r1 - if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16) - { - // Exclude known bad winograd configs (and defaults to GEMM) - const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = - { - // Squeezenet_V1_1 fire2 and fire3 - ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), - // Squeezenet_V1_1 fire6 and fire7 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)), - // Squeezenet_V1_1 fire8 and fire9 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)), - }; - const auto find_conv_config = [&](ConvolutionConfiguration c) - { - const PadStrideInfo info = std::get<3>(c); - - return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(c) == Size2D(weights->dimension(idx_c), 
weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); - }; - - bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(), - find_conv_config) - != known_bad_winograd_f16_with_fastmath_configs.end(); - if(found_bad) - { - return ConvolutionMethod::GEMM; - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // For 1x1 convolutions run the default GEMM - if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) - { - return ConvolutionMethod::GEMM; - } + _impl->op->prepare(_impl->prep_pack); - if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) - { - return ConvolutionMethod::WINOGRAD; - } - if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info))) - { - return ConvolutionMethod::GEMM_CONV2D; - } - return ConvolutionMethod::GEMM; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); } } - -void NEConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - -void NEConvolutionLayer::prepare() -{ - _function->prepare(); -} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index 20642b5eed..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuCopy.h" + +#include "src/cpu/operators/CpuCopy.h" #include <utility> @@ -32,16 +33,15 @@ namespace arm_compute { struct NECopy::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuCopy> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCopy> op{nullptr}; }; -NECopy::NECopy() - : _impl(std::make_unique<Impl>()) +NECopy::NECopy() : _impl(std::make_unique<Impl>()) { } -NECopy::NECopy(NECopy &&) = default; +NECopy::NECopy(NECopy &&) = default; NECopy &NECopy::operator=(NECopy &&) = default; NECopy::~NECopy() = default; diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index 1e1070d961..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -21,10 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
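The net effect of the NEConvolutionLayer rewrite above: method selection (get_convolution_method) and the Winograd/GEMM/GEMM_CONV2D/direct paths are delegated to cpu::CpuConv2d, the FFT path keeps its NEFFTConvolutionLayer, and the function's own job shrinks to workspace bookkeeping (aux_mem_req, manage_workspace, release_temporaries). Caller-visible behavior is unchanged, as in this sketch; shapes are illustrative and the default-constructed function assumes the header's nullptr memory-manager default:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NCHW F32: 32x32x3 input, sixteen 3x3 filters, same-size output.
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    NEConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // run() first calls prepare(): op->prepare(prep_pack) packs the weights
    // into workspace tensors, then prepare-only buffers are released.
    // Execution itself is op->run(run_pack) with ACL_SRC_0/1/2 and ACL_DST.
    conv.run();
    return 0;
}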
*/ -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/runtime/NEON/functions/NECropResize.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NECropKernel.h" #include <cstddef> @@ -34,18 +36,32 @@ namespace arm_compute NECropResize::~NECropResize() = default; NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -55,11 +71,18 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); + ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y); @@ -79,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { auto crop_tensor = std::make_unique<Tensor>(); TensorInfo crop_result_info(1, DataType::F32); @@ -106,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < 
_num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until run-time and so the kernels cannot be configured until then. @@ -115,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false }); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 5bd61b4074..081c7cc538 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute::misc::shape_calculator; @@ -61,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } - } // namespace NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT @@ -77,20 +78,29 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor _original_weights(nullptr), _input(nullptr), _info(), - _is_prepared(false) + _is_prepared(false), + _do_upsampling(true) { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); } @@ -99,11 +109,23 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + const unsigned int pad_left = info.pad_left(); + const unsigned int pad_top = info.pad_top(); + const unsigned int pad_right = info.pad_right(); + const unsigned int pad_bottom = info.pad_bottom(); + + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first + + weights->dimension(width_idx)) < (pad_left + pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second + + weights->dimension(height_idx)) < (pad_top + pad_bottom)); - if(bias != nullptr) + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); + + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -113,57 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - // Guard against overflows in compute_deconvolution_upsampled_shape() - const DataLayout data_layout = input->data_layout(); - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int out_x = (input->dimension(idx_w) - 1) * stride_x + 1; - const unsigned int out_y = (input->dimension(idx_h) - 1) * stride_y + 1; - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) > out_x); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) > out_y); - ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first); - ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second); - - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, 
deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - - const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const uint32_t stride_x = info.stride().first; + const uint32_t stride_y = info.stride().second; + const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast<int32_t>(stride_x), + static_cast<int32_t>(stride_y), out_dims); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0, + "Negative padding not supported"); + + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + + // Do not perform upsampling when the operation uses unit stride in all dimensions + const bool do_upsampling = stride_x != 1 || stride_y != 1; + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); + if (do_upsampling) + { + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? 
nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -176,32 +225,24 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _memory_group.manage(&_scaled_output); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); - scale_out_info.set_data_layout(data_layout); - _scaled_output.allocator()->init(scale_out_info); - - _upsample_f.configure(input, &_scaled_output, upsample_info); - - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + // Do not perform upsampling when the operation uses unit stride in all dimensions + _do_upsampling = stride_x != 1 || stride_y != 1; // Setup flip axis data _flip_axis.allocator()->allocate(); @@ -209,7 +250,32 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[0] = static_cast<uint32_t>(width_idx); axis_data[1] = static_cast<uint32_t>(height_idx); - _scaled_output.allocator()->allocate(); + // Setup convolution and upsampling, if needed + if (_do_upsampling) + { + _memory_group.manage(&_scaled_output); + + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + 
scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // Minor optimization: In the upsampling step, we do not need to allocate space for the padding in the upsampled image. + // The padding amount can be given as input to the convolution layer. + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + + _scaled_output.allocator()->allocate(); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + } } void NEDeconvolutionLayer::run() @@ -218,13 +284,16 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - _upsample_f.run(); + if (_do_upsampling) + { + _upsample_f.run(); + } _conv_f.run(); } void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index 07e985c25e..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuCast.h" + +#include "src/cpu/operators/CpuCast.h" #include <utility> @@ -32,16 +33,15 @@ namespace arm_compute { struct NEDepthConvertLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuCast> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; }; -NEDepthConvertLayer::NEDepthConvertLayer() - : _impl(std::make_unique<Impl>()) +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>()) { } -NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; NEDepthConvertLayer::~NEDepthConvertLayer() = default; @@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return cpu::CpuCast::validate(input, output, policy); @@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void NEDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 2793c3f27e..5eea4dca65 100644 --- 
a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,15 +25,24 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + auto k = std::make_unique<NEDepthToSpaceLayerKernel>(); k->configure(input, output, block_shape); _kernel = std::move(k); @@ -43,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index a561b88058..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
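Stepping back to the NEDeconvolutionLayer change a few hunks up: zero-insertion upsampling is only required when a stride exceeds 1, so the unit-stride case now skips the _scaled_output tensor entirely and folds the deconvolution padding into the convolution itself. The decision, distilled into a self-contained helper (names mirror the diff; this is a restatement for clarity, not library code):

#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Distilled from NEDeconvolutionLayer::configure(): choose the PadStrideInfo
// handed to the inner NEConvolutionLayer.
PadStrideInfo make_deconv_conv_info(unsigned int stride_x, unsigned int stride_y, const PadStrideInfo &upsample_info)
{
    const bool do_upsampling = stride_x != 1 || stride_y != 1;
    if (do_upsampling)
    {
        // The upsample stage has already handled the geometry; convolve pad-free.
        return PadStrideInfo(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
    }
    // Unit strides: no zero insertion is needed, so the padding computed by
    // compute_upsample_info() goes straight onto the convolution.
    return PadStrideInfo(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(),
                         upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
}

This also explains the new guard in validate(): compute_deconvolution_padding() must not produce negative values before they are folded into the convolution's PadStrideInfo.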
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,9 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDepthwiseConv2d.h" using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; @@ -38,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ITensor *src{ nullptr }; // SRC_0 - ITensor *dst{ nullptr }; // DST_0 - const ITensor *weights - { - nullptr - }; // SRC_1 - const ITensor *biases - { - nullptr - }; // SRC_2 + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 Tensor permuted_input{}; // INT_0 Tensor permuted_weights{}; // INT_1 Tensor permuted_output{}; // INT_2 Tensor workspace{}; // INT_3 Tensor packed_weights{}; // INT_4 - std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; + bool is_prepared{false}; + bool permute{false}; }; -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager) +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -81,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permute = is_nhwc; _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), - _impl->dst->info(), info); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? 
nullptr : _impl->biases->info(), _impl->dst->info(), info); // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); @@ -91,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!is_activationlayer_enabled) + if (!is_activationlayer_enabled) { act_info_to_use = act_info; } - info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>(); - if(is_nhwc) + if (is_nhwc) { auto permute_input = std::make_unique<cpu::CpuPermute>(); auto permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -121,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); @@ -132,28 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } else { - dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment); - + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); + _memory_group.manage(&_impl->workspace); + _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -178,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_impl->permute) + if (_impl->permute) { _impl->permuted_weights.allocator()->allocate(); } - if(!_impl->permuted_weights.is_used()) + if (!_impl->permuted_weights.is_used()) { _impl->permuted_weights.allocator()->free(); } @@ -200,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl Tensor permuted_input{}; Tensor permuted_weights{}; Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -215,16 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D 
&dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); - _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); _impl->src = input; _impl->dst = output; @@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_input = std::make_unique<cpu::CpuPermute>(); auto permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); weights_to_use = &_impl->permuted_weights; - _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); output_to_use = &_impl->permuted_output; } auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); - depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? 
nullptr : biases->info(), output_to_use->info(), info); - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_output = std::make_unique<cpu::CpuPermute>(); permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); @@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -298,43 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemory #ifndef DOXYGEN_SKIP_THIS struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl { - DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED }; - NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr }; + DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED}; + NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr}; NEDepthwiseConvolutionLayerGeneric func_generic{}; - std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr }; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; }; #endif // DOXYGEN_SKIP_THIS -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); + + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>(); - _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - info); - switch(_impl->depth_conv_func) + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.run(); @@ -349,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.prepare(); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 91e37594af..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" -#include "src/runtime/cpu/operators/CpuDequantize.h" + +#include "src/cpu/operators/CpuDequantize.h" namespace arm_compute { struct NEDequantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuDequantize> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuDequantize> op{nullptr}; }; -NEDequantizationLayer::NEDequantizationLayer() - : _impl(std::make_unique<Impl>()) +NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>()) { } NEDequantizationLayer::~NEDequantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp index 9e63800728..b347390162 100644 --- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp +++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" + #include <cstddef> #include <ios> #include <list> @@ -34,23 +36,36 @@ namespace arm_compute { NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false) + : _memory_group(std::move(memory_manager)), + _dequantize(), + _detection_post_process(), + _decoded_scores(), + _run_dequantize(false) { } -void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); - ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), - output_scores->info(), - num_detection->info(), info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); const ITensor *input_scores_to_use = input_scores; DetectionPostProcessLayerInfo info_to_use = info; _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type()); - if(_run_dequantize) + if (_run_dequantize) { _memory_group.manage(&_decoded_scores); @@ -59,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c input_scores_to_use = &_decoded_scores; // Create a new info struct to avoid dequantizing in the CPP layer - std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() }; - DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(), - scales_values, info.use_regular_nms(), info.detection_per_class(), false); + std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), + info.scale_value_w()}; + DetectionPostProcessLayerInfo info_quantized( + info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), + info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false); info_to_use = info_quantized; } - _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use); + 
_detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info_to_use); _decoded_scores.allocator()->allocate(); } -Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_scores, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type()); - if(run_dequantize) + if (run_dequantize) { TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, + output_boxes, output_classes, output_scores, + num_detection, info)); return Status{}; } @@ -88,7 +114,7 @@ void NEDetectionPostProcessLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Decode scores if necessary - if(_run_dequantize) + if (_run_dequantize) { _dequantize.run(); } diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index 58530e4a8f..f1c2cf969f 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,18 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuDirectConv2d.h" + +#include "src/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr }; + ITensor *src{nullptr}; + const ITensor *weights{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr}; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) @@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManage } NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default; -void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void NEDirectConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { _impl->src = input; _impl->weights = weights; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager); - 
_impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), + conv_info, act_info); } -Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index 946bbb24b8..685ef2d4d7 100644 --- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuElementwise.h" #include <utility> @@ -33,17 +34,16 @@ namespace arm_compute { struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr}; }; -NEElementwiseMax::NEElementwiseMax() - : _impl(std::make_unique<Impl>()) +NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>()) { } -NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; +NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default; NEElementwiseMax::~NEElementwiseMax() = default; @@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMax::validate(input1, input2, output); @@ -74,17 +77,16 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr}; }; -NEElementwiseMin::NEElementwiseMin() - : _impl(std::make_unique<Impl>()) +NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>()) { } -NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; +NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default; 
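// The elementwise functions in this file all share the structure visible above:
// a pimpl that pairs raw ITensor pointers with a cpu:: operator, a configure()
// that does the one-off shape/type work on ITensorInfo, and a run() that
// re-packs the pointers into an ITensorPack on every call. A condensed sketch
// of the pattern (MyFunction and cpu::CpuMyOp are placeholder names, not real
// ACL classes):
struct MyFunction::Impl
{
    const ITensor                *src_0{nullptr};
    const ITensor                *src_1{nullptr};
    ITensor                      *dst{nullptr};
    std::unique_ptr<cpu::CpuMyOp> op{nullptr};
};

void MyFunction::configure(ITensor *input1, ITensor *input2, ITensor *output)
{
    _impl->src_0 = input1;
    _impl->src_1 = input2;
    _impl->dst   = output;
    _impl->op    = std::make_unique<cpu::CpuMyOp>();
    _impl->op->configure(input1->info(), input2->info(), output->info()); // shape/type work happens once
}

void MyFunction::run()
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack); // the operator never owns the tensor storage
}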
NEElementwiseMin::~NEElementwiseMin() = default; @@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMin::validate(input1, input2, output); @@ -115,21 +120,23 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr}; }; -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() - : _impl(std::make_unique<Impl>()) +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>()) { } -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default; -void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseSquaredDiff::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); @@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr}; }; -NEElementwiseDivision::NEElementwiseDivision() - : _impl(std::make_unique<Impl>()) +NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>()) { } -NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; +NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default; NEElementwiseDivision::~NEElementwiseDivision() = default; -void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo 
&act_info) +void NEElementwiseDivision::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseDivision::validate(input1, input2, output); @@ -197,21 +212,23 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwisePower> op{nullptr}; }; -NEElementwisePower::NEElementwisePower() - : _impl(std::make_unique<Impl>()) +NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>()) { } -NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; +NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default; NEElementwisePower::~NEElementwisePower() = default; -void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwisePower::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwisePower::validate(input1, input2, output); @@ -239,22 +259,22 @@ void NEElementwisePower::run() template <ComparisonOperation COP> struct NEElementwiseComparisonStatic<COP>::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{nullptr}; }; template <ComparisonOperation COP> -NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() - : _impl(std::make_unique<Impl>()) +NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>()) { } template <ComparisonOperation COP> NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default; -template <ComparisonOperation COP> -NEElementwiseComparisonStatic<COP> 
&NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default; -template <ComparisonOperation COP> +template <ComparisonOperation COP> +NEElementwiseComparisonStatic<COP> & +NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default; +template <ComparisonOperation COP> NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default; template <ComparisonOperation COP> @@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *inp } template <ComparisonOperation COP> -Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) { return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output); } template <ComparisonOperation COP> -void NEElementwiseComparisonStatic<COP>::run() +void NEElementwiseComparisonStatic<COP>::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); @@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic<COP>::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr}; }; -NEElementwiseComparison::NEElementwiseComparison() - : _impl(std::make_unique<Impl>()) +NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>()) { } -NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; +NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default; NEElementwiseComparison::~NEElementwiseComparison() = default; @@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->op->configure(input1->info(), input2->info(), output->info(), op); } -Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) +Status NEElementwiseComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation op) { return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp index 1a9e8839ca..23a092c407 100644 --- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp +++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
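// The comparison functions above come in two flavours: a compile-time variant
// templated on ComparisonOperation (NEElementwiseComparisonStatic<COP>) and a
// run-time variant that receives the operation in configure(). A usage sketch
// for the run-time flavour, assuming two already-initialised and allocated F32
// inputs and a U8 output that receives the byte mask:
void compare_example(ITensor *a, ITensor *b, ITensor *mask /* U8 */)
{
    NEElementwiseComparison cmp{};
    cmp.configure(a, b, mask, ComparisonOperation::Greater);
    cmp.run(); // mask holds 255 where a > b, 0 elsewhere
}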
*/ #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h" -#include "src/runtime/cpu/operators/CpuElementwiseUnary.h" + +#include "src/cpu/operators/CpuElementwiseUnary.h" + #include <utility> namespace arm_compute @@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary; template <ElementWiseUnary op> struct NEElementwiseUnaryLayer<op>::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<OperatorType> cpu_op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<OperatorType> cpu_op{nullptr}; }; template <ElementWiseUnary op> -NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() - : _impl(std::make_unique<Impl>()) +NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>()) { } template <ElementWiseUnary op> NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default; template <ElementWiseUnary op> NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default; -template <ElementWiseUnary op> +template <ElementWiseUnary op> NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default; template <ElementWiseUnary op> @@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITe } template <ElementWiseUnary op> -void NEElementwiseUnaryLayer<op>::run() +void NEElementwiseUnaryLayer<op>::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _impl->src); diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp index e72488f0f6..fb75f9da29 100644 --- a/src/runtime/NEON/functions/NEFFT1D.cpp +++ b/src/runtime/NEON/functions/NEFFT1D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" @@ -36,7 +38,15 @@ namespace arm_compute NEFFT1D::~NEFFT1D() = default; NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(), + _fft_kernels(), + _scale_kernel(), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _axis(0), + _run_scale(false) { } @@ -44,6 +54,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Decompose size to radix factors const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -72,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & _fft_kernels.resize(_num_ffts); _axis = config.axis; - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -82,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>(); - _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; _scale_kernel = std::make_unique<NEFFTScaleKernel>(); - is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? 
_scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -111,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -120,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -138,13 +151,13 @@ void NEFFT1D::run() NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index 3b787cd523..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
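// NEFFT1D::validate() above rejects any FFT length whose factorisation is not
// covered by NEFFTRadixStageKernel::supported_radix(): decompose_stages()
// returns an empty vector in that case. The idea can be sketched as a greedy
// factorisation (the radix set and function below are illustrative, not the
// library's implementation):
#include <set>
#include <vector>

std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &radix)
{
    std::vector<unsigned int> stages;
    for (auto r = radix.rbegin(); r != radix.rend(); ++r) // try larger radices first
    {
        while (N % *r == 0)
        {
            stages.push_back(*r);
            N /= *r;
        }
    }
    return (N == 1) ? stages : std::vector<unsigned int>{}; // empty => not decomposable
}
// e.g. decompose(12, {2, 3, 4, 5, 7, 8}) yields {4, 3}, while decompose(11, ...)
// is empty -- which is why the FFT convolution path pads spatial sizes until
// they decompose.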
* * SPDX-License-Identifier: MIT * @@ -26,16 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" -#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" -#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" -#include "src/core/NEON/kernels/NEFFTScaleKernel.h" + +#include "src/common/utils/Log.h" namespace arm_compute { NEFFT2D::~NEFFT2D() = default; NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -43,6 +45,7 @@ void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo & { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Setup first pass FFT1DInfo first_pass_config; @@ -79,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 56fc2e4a2b..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" namespace arm_compute @@ -45,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -101,10 +103,16 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem } NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); + 
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; _original_bias = biases; @@ -113,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? &_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -135,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -156,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -164,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -191,7 +202,8 @@ 
void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -203,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -233,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -245,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -258,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); @@ -277,11 +297,13 @@ Status 
NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -289,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -311,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -329,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -347,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -360,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index ee539fdfc8..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuFill.h" + +#include "src/cpu/operators/CpuFill.h" #include <utility> @@ -32,15 +33,14 @@ namespace arm_compute { struct NEFill::Impl { - ITensor *tensor{ nullptr }; - std::unique_ptr<cpu::CpuFill> op{ nullptr }; + ITensor 
*tensor{nullptr}; + std::unique_ptr<cpu::CpuFill> op{nullptr}; }; -NEFill::NEFill() - : _impl(std::make_unique<Impl>()) +NEFill::NEFill() : _impl(std::make_unique<Impl>()) { } -NEFill::NEFill(NEFill &&) = default; +NEFill::NEFill(NEFill &&) = default; NEFill &NEFill::operator=(NEFill &&) = default; NEFill::~NEFill() = default; diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index 256aad6d3f..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -25,17 +25,22 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { -NEFillBorder::NEFillBorder() - : _border_handler(nullptr) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) { } -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { + ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); _border_handler = std::make_unique<NEFillBorderKernel>(); _border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value); } @@ -44,4 +49,4 @@ void NEFillBorder::run() { NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index 4d1054ad25..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -24,25 +24,25 @@ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" -#include "src/runtime/cpu/operators/CpuFlatten.h" +#include "src/cpu/operators/CpuFlatten.h" namespace arm_compute { struct NEFlattenLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuFlatten> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFlatten> op{nullptr}; }; -NEFlattenLayer::NEFlattenLayer() - : _impl(std::make_unique<Impl>()) +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>()) { } -NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; NEFlattenLayer::~NEFlattenLayer() = default; @@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique<cpu::CpuFlatten>(); _impl->op->configure(_impl->src->info(), _impl->dst->info()); @@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return cpu::CpuFlatten::validate(input, output); diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index f8a3c13d6d..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -24,22 +24,22 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuFloor.h" + +#include "src/cpu/operators/CpuFloor.h" namespace arm_compute { struct NEFloor::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuFloor> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFloor> op{nullptr}; }; -NEFloor::NEFloor() - : _impl(std::make_unique<Impl>()) +NEFloor::NEFloor() : _impl(std::make_unique<Impl>()) { } -NEFloor::NEFloor(NEFloor &&) = default; +NEFloor::NEFloor(NEFloor &&) = default; NEFloor &NEFloor::operator=(NEFloor &&) = default; 
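
NEFlattenLayer and NEFloor above (and NEFill before them) all migrate to the same shape: the public function keeps only a small Impl holding the tensors plus a cpu operator, configure() forwards ITensorInfo objects to the operator, and run() binds the live tensors through an ITensorPack. A condensed sketch of that pattern; NEExample and cpu::CpuExample are placeholder names, not real ACL types, and the arm_compute headers are assumed:

    // Sketch of the function-over-operator pattern used throughout this patch.
    struct NEExample::Impl
    {
        const ITensor                   *src{nullptr};
        ITensor                         *dst{nullptr};
        std::unique_ptr<cpu::CpuExample> op{nullptr};
    };

    void NEExample::configure(const ITensor *input, ITensor *output)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<cpu::CpuExample>();
        _impl->op->configure(input->info(), output->info()); // operators see only ITensorInfo
    }

    void NEExample::run()
    {
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, _impl->src); // bind real tensors at run time
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }
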
NEFloor::~NEFloor() = default; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index f469a0bdab..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,469 +23,138 @@ */ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/cpu/kernels/CpuTransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" -#include <cmath> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; -namespace +struct NEFullyConnectedLayer::Impl { -// Get min, max bound of a quantized asymmetric output tensor, with the effect of fused activation -std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) -{ - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - const UniformQuantizationInfo q_unif = q_info.uniform(); - - if(act_info.enabled()) - { - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(q_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(q_unif.offset); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(act_info.b(), data_type, q_info); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - return std::make_pair(type_min, type_max); -} + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; -Status get_gemmlowp_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) -{ - const auto data_type = input->data_type(); - const QuantizationInfo oq_info = output->quantization_info(); - const UniformQuantizationInfo iq_unif = input->quantization_info().uniform(); - const
UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; - int32_t output_multiplier; - int32_t output_shift; - - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type); + std::unique_ptr<cpu::CpuFullyConnected> op{nullptr}; - gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage_info.gemmlowp_shift = output_shift; - gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; - gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>(); - gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>(); + const ITensor *original_weights{nullptr}; - return Status{}; -} - -Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act) -{ - if(is_data_type_quantized_asymmetric(input->data_type())) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(input->quantization_info().uniform().scale, -input->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(input, weights, output, act, gemmlowp_output_stage_info)); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_quantization_info(input_quantization_info), - &weights->clone()->set_quantization_info(weights_quantization_info), - biases, - output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(input, weights, biases, output, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */))); - } - - return Status{}; -} -} // namespace + bool is_prepared{false}; + bool dynamic_weights{false}; +}; NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(), - _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), - _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), _is_prepared(false) -{ -} - -void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const 
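
The removed get_gemmlowp_output_stage_info() above collapses the input, weight and output scales into one real multiplier, multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale, which calculate_quantized_multiplier() then splits into a normalized int32 multiplier and a shift so the output stage can requantize in pure integer arithmetic. A minimal standalone sketch of that split, assuming 0 < multiplier < 1 (the common case; the library routine also handles multipliers greater than or equal to one):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Sketch: represent a real multiplier M as M0 * 2^-shift, where M0 is a
    // Q0.31 fixed-point value in [0.5, 1). Mirrors the idea behind
    // quantization::calculate_quantized_multiplier(), not its exact code.
    void quantize_multiplier_sketch(double multiplier, int32_t &quantized, int &right_shift)
    {
        assert(multiplier > 0.0 && multiplier < 1.0);
        int exponent = 0;
        const double m0 = std::frexp(multiplier, &exponent); // multiplier = m0 * 2^exponent, m0 in [0.5, 1)
        int64_t q = std::llround(m0 * (1LL << 31));
        if (q == (1LL << 31)) // rounding may push m0 up to exactly 1.0
        {
            q /= 2;
            ++exponent;
        }
        quantized   = static_cast<int32_t>(q);
        right_shift = -exponent; // result ~ (x * quantized) >> (31 + right_shift)
    }
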
ActivationLayerInfo &act) -{ - if(_is_quantized_asymmetric) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Configure gemmlowp function and output stage for asymmetric quantized types - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(input->info(), weights->info(), output->info(), act, gemmlowp_output_stage_info); - ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - gemm_info.set_activation_info(act); - _mm_gemmlowp.configure(input, weights, biases, output, gemm_info); - - // Revert back QuantizationInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); - gemm_info.set_activation_info(act); - _mm_gemm.configure(input, weights, biases, output, 1.f, 1.0f, gemm_info); - } -} - -void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) -{ - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - - _flatten.configure(input, &_flatten_output); - - // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, biases, output, act); - - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); -} - -void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); - - // Configure matrix multiply kernel - configure_mm(input, weights, biases, output, act); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info) +void
NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + output->info(), fc_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type()); - _original_weights = weights; + _impl->op = std::make_unique<cpu::CpuFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = false; - if(_weights_manager) - { - _weights_manager->manage(weights); - } - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensor *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) - { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); - } - else - { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(weights); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + if (_impl->weights_manager != nullptr) { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed); - } - else - { - // Convert weights - _convert_weights.configure(weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->weights_manager->manage(_impl->original_weights); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - - bool weights_reshaped = fc_info.transpose_weights ? 
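
The tail of the new configure() above deserves a gloss: dynamic_weights records that the weight values may change between calls, in which case run() below skips the cached prepare() path and the operator re-processes the weights on every invocation. Restated as a plain predicate; this helper is illustrative only, with the flag names taken from the hunk:

    // Sketch: when are fully-connected weights treated as dynamic?
    bool is_dynamic_weights(bool values_constant,   // weights->info()->are_values_constant()
                            bool transpose_weights, // fc_info.transpose_weights
                            bool already_reshaped,  // fc_info.are_weights_reshaped
                            bool retain_internal)   // fc_info.retain_internal_weights
    {
        // Non-constant values that still need transposing, with no cached copy
        // to fall back on: prepare() results cannot be reused across runs.
        return !values_constant && transpose_weights && !already_reshaped && !retain_internal;
    }
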
fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(input_to_use, weights_to_use, biases, output, fc_info.activation_info)); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); +} - return Status{}; +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - _flatten.run(); + prepare(); } - // Run matrix multiply - if(_is_quantized_asymmetric) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + 
_impl->op->run(_impl->run_pack); } void NEFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](Tensor * w) - { - if(!w->is_used()) - { - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - const ITensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b is marked as unused (and its memory released) only after the last function that uses b has finished its prepare stage + // This handles cases where multiple functions share the same b (weights) + // When a function marks the original b as unused, we pre-mark it in the weights manager and mark it back as used, so that it is not released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - } - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM and release unused weights - if(!_is_quantized_asymmetric) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index a8ce6b2bfc..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited.
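
The weights-manager block in prepare() above implements a hand-over protocol for weights shared by several functions: a function that finds the weights already flagged unused pre-marks them in the manager, then flips them back to used, so that the manager's release(), not any single function, decides when the backing memory is actually freed. Schematically, using only the calls visible in the hunk:

    // Sketch: release protocol for weights shared across functions.
    if (weights_manager != nullptr && weights_manager->are_weights_managed(original_weights))
    {
        if (!original_weights->is_used()) // the last user has finished its prepare()
        {
            weights_manager->pre_mark_as_unused(original_weights);
        }
        original_weights->mark_as_used();           // keep alive until the manager agrees
        weights_manager->release(original_weights); // manager frees after the last release
    }
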
* * SPDX-License-Identifier: MIT * @@ -28,32 +28,50 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" namespace arm_compute { NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>(); - _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 7318c3e492..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,382 +23,140 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" -#include <cmath> +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemm.h" -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; namespace arm_compute { -namespace +struct NEGEMM::Impl { -cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) -{ - cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - return asm_info; -} -} // namespace + std::unique_ptr<cpu::CpuGemm> op{nullptr}; + + const ITensor *original_b{nullptr}; + bool is_prepared{false}; + + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; +}; NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>()), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), - _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } NEGEMM::~NEGEMM() = default; -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); - - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); - bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? 
c->info() : nullptr, d->info(), asm_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); // Check if we need to reshape the matrix B only on the first run - _is_prepared = false; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _original_b = b; - _run_alpha_scale = alpha != 1.f; - _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run(); - _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run(); - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + _impl->is_prepared = false; + _impl->original_b = b; + _impl->op = std::make_unique<cpu::CpuGemm>(); - if(run_optimised) + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - const ITensor *c_to_use = is_c_bias ? c : nullptr; - const ITensorInfo *c_info_to_use = c_to_use != nullptr ? c_to_use->info() : nullptr; - _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info); - ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - - _asm_glue_tensors = - { - { ACL_SRC_0, a }, - { ACL_SRC_1, b }, - { ACL_SRC_2, c_to_use }, - { ACL_DST, d }, - }; - - // Scale product by alpha - if(_run_alpha_scale) - { - _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); - } + b_info_to_use->set_are_values_constant(false); } - else - { - // Pick output tensor in case bias addition should be performed - ITensor *gemm_output_to_use = d; - if(_run_bias_addition) - { - gemm_output_to_use = &_tmp_d; - _memory_group.manage(&_tmp_d); - } - - _mm_kernel = std::make_unique<NEGEMMMatrixMultiplyKernel>(); - - // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) - { - // Configure the matrix multiply kernel - _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); - } - else - { - TensorShape shape_tmp_a = a->info()->tensor_shape(); - TensorShape shape_tmp_b = b->info()->tensor_shape(); - - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); - - const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type()); - shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w))); - - TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true); - TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true); - - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - int m = a->info()->dimension(1); - int n = b->info()->dimension(0); - int k = a->info()->dimension(0); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? 
c->info() : nullptr, d->info(), alpha, beta, + gemm_info); - // Configure interleave kernel - _interleave_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>(); - _interleave_kernel->configure(a, &_tmp_a); - - // Configure transpose kernel - _transpose_kernel = std::make_unique<NEGEMMTranspose1xWKernel>(); - _transpose_kernel->configure(b, &_tmp_b); - - // Configure matrix multiplication kernel - _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); - - // Allocate once the all configure methods have been called - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_run_bias_addition) - { - _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _tmp_d.allocator()->allocate(); - } - } - - // Configure matrix addition kernel - if(_run_addition) - { - _ma_kernel = std::make_unique<NEGEMMMatrixAdditionKernel>(); - _ma_kernel->configure(c, d, beta); - } - - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(_run_activation) - { - _activation_func.configure(d, nullptr, activation); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_UNUSED(alpha); - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - } - - if(c != nullptr && !is_c_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) - { - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - 
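
Note the two tensor packs built at the end of the new NEGEMM::configure(): run() needs all four operands, while prepare() only touches the operands that stay constant across runs (B and the bias C), so the prep pack is the subset {ACL_SRC_1, ACL_SRC_2}; manage_workspace() then allocates the operator's auxiliary tensors into whichever pack their lifetime requires. Schematically, with the names as in the hunk above:

    // Sketch: which operands each phase of NEGEMM consumes.
    ITensorPack run_pack  = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}};
    ITensorPack prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; // constant operands only

    op->prepare(prep_pack); // one-shot work, e.g. pretransposing B
    op->run(run_pack);      // per-run work: the GEMM itself
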
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - - // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info)); - - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - // Check if the first input tensor is a vector. - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run()); - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo - // in order to know how the matrices have been reshaped - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); - - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info = *output->clone(); - - if(run_interleave_transpose) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info)); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - - // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - - if(c != nullptr && gemm_info.reshape_b_only_on_first_run()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE)); - } - } - - // Validate matrix addition kernel - if(beta != 0 && c != nullptr && !is_c_bias) + // Make the B matrix dynamic values. 
+ auto b_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta)); + b_to_use->set_are_values_constant(false); } - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } + return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); +} - return Status{}; +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha, beta); + return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); } void NEGEMM::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_asm_glue->is_configured()) - { - _asm_glue->run(_asm_glue_tensors); - if(_run_alpha_scale) - { - _alpha_scale_func.run(); - } - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(_interleave_kernel.get(), Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY); - } - } - - NEScheduler::get().schedule(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY); - - // Run bias addition kernel - if(_run_bias_addition) - { - _add_bias.run(); - } - } - - // Run matrix addition kernel - if(_run_addition) - { - NEScheduler::get().schedule(_ma_kernel.get(), Window::DimY); - } - - // Run activation function - if(_run_activation) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMM::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - if(_asm_glue->is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - _asm_glue->prepare(_asm_glue_tensors); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->original_b->mark_as_unused(); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 564ce2f514..6cca02eea9 
100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -24,50 +24,93 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" +#include "arm_compute/runtime/Tensor.h" -#include <set> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" namespace arm_compute { using OperatorType = cpu::CpuGemmDirectConv2d; +using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - ITensorPack tensors{}; - std::unique_ptr<OperatorType> op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + MemoryGroup memory_group{}; + bool is_prepared{false}; + experimental::MemoryRequirements aux_mem_req{}; }; -NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) - : _impl(std::make_unique<Impl>()) +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>()) { - _impl->op = std::make_unique<OperatorType>(memory_manager); + _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; -void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases); - _impl->tensors.add_tensor(TensorType::ACL_DST, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + _impl->weights = weights; + _impl->is_prepared = false; + _impl->op = std::make_unique<OperatorType>(); + + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); + + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } + void NEGEMMConv2d::run() { - _impl->op->run(_impl->tensors); + prepare(); + + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } + void NEGEMMConv2d::prepare() { - _impl->op->prepare(_impl->tensors); + if (!_impl->is_prepared) + { + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->weights->mark_as_unused(); + } + else + { + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); + } + + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; + } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 2876c254fa..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. 
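
NEGEMM::prepare() and NEGEMMConv2d::prepare() above share one idiom: after the operator's prepare() has run, they search its memory requirements for a Persistent entry. Finding one means the operator cached its own reshaped copy of the weights, so the original tensor can be marked unused; otherwise the original weights are put back into the run pack, because the operator will keep reading them. Condensed sketch, with MemoryInfo and MemoryLifetime as used in the hunks; std::any_of replaces the find_if purely for brevity:

    // Sketch of the post-prepare() weight-release decision.
    #include <algorithm>

    const bool operator_kept_weights =
        std::any_of(aux_mem_req.begin(), aux_mem_req.end(),
                    [](const MemoryInfo &m) { return m.lifetime == MemoryLifetime::Persistent; });

    if (operator_kept_weights)
    {
        original_weights->mark_as_unused(); // reshaped copy lives in the workspace
    }
    else
    {
        run_pack.add_const_tensor(ACL_SRC_1, original_weights); // operator still reads the original
    }
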
* * SPDX-License-Identifier: MIT * @@ -26,617 +26,109 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NECol2ImKernel.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/NEON/kernels/NEIm2ColKernel.h" -#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmConv2d.h" -#include <set> -#include <tuple> +using namespace arm_compute::experimental; namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; - -NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default; -NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() noexcept - : _weights_reshape_kernel() -{ -} - -void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info())); - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ITensor *biases_to_use = (append_biases) ? 
biases : nullptr; - - _weights_reshape_kernel = std::make_unique<NEWeightsReshapeKernel>(); - _weights_reshape_kernel->configure(weights, biases_to_use, output); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) +struct NEGEMMConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, - DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - - NEWeightsReshapeKernel::validate(weights, biases, output); - } - - return Status{}; -} - -void NEConvolutionLayerReshapeWeights::run() + const ITensor *weights{nullptr}; + std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<Tensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3); + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } - NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default; -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), - _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false) +void NEGEMMConvolutionLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { -} - -void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? 
nullptr : output->info(), - act_info, gemm_3d_depth, _skip_im2col)); - - // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); - - // Supported activations in GEMM - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = input->info()->quantization_info(); - const QuantizationInfo wqinfo = weights->info()->quantization_info(); - const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info(); - const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - const DataType data_type = input->info()->data_type(); - - input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(weights->info()->data_type())) - { - const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); - weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); - } - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); - quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); - - _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info)); - - // Revert back QuantizationInfo as input and weights could be used in other convolution layers - input->info()->set_quantization_info(iqinfo); - weights->info()->set_quantization_info(wqinfo); - } - else - { - // Configure matrix multiply function - _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info); - } -} - -Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) -{ - const DataType data_type = input->data_type(); - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_activation_enabled = act_info.enabled(); - - // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input
as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); - - if(is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo &iqinfo = input->quantization_info(); - const QuantizationInfo &wqinfo = weights->quantization_info(); - const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); - - // Perform validation step on GEMMLowp - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info)); - } - else - { - // Perform validation step on Matrix multiply function - return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info); - } -} - -Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) -{ - const DataType data_type = input_info->data_type(); - const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; - const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; - - // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); - const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col); + _impl->weights = weights; + _impl->op = std::make_unique<cpu::CpuGemmConv2d>(); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _original_output = output; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - const ITensor *gemm_input_to_use = input; - ITensor *gemm_output_to_use = output; - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - if(data_layout == DataLayout::NHWC) - { - _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!_skip_col2im) - { - _skip_im2col = false; - } - } - else - { - _skip_col2im = false; - } - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels); - - // _weights_reshaped will be auto configured in the kernel. - // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM - const ITensor *weights_to_use = weights; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(weights, nullptr); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed); - } - else - { - _reshape_weights.configure(weights, nullptr, &_weights_reshaped); - weights_to_use = &_weights_reshaped; - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure - _im2col_kernel = std::make_unique<NEIm2ColKernel>(); - _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // Calculate GEMM output shape - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. 
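For reference while reading this hunk: the removed configure() body here (and the removed validate() body further down in the same file) both derive the same convolution-as-GEMM geometry before dispatching the matrix multiply. A standalone restatement of that shape arithmetic, written as a hypothetical helper rather than ACL API:

```cpp
// Hypothetical sketch (not ACL API) of the shape arithmetic the removed
// configure()/validate() bodies perform when lowering convolution to GEMM.
struct ConvAsGemmShapes
{
    unsigned int im2col_dim0; // mat_weights_rows = kernel_w * kernel_h * channels
    unsigned int im2col_dim1; // conv_w * conv_h: one flattened input patch per output position
    unsigned int gemm_dim0;   // mat_weights_cols = number of kernels (output channels)
    unsigned int gemm_dim1;   // conv_w * conv_h, mapped back to (conv_w, conv_h) by col2im or a reshape
};

inline ConvAsGemmShapes conv_as_gemm_shapes(unsigned int kernel_w,
                                            unsigned int kernel_h,
                                            unsigned int channels,
                                            unsigned int num_kernels,
                                            unsigned int conv_w,
                                            unsigned int conv_h)
{
    return ConvAsGemmShapes{kernel_w * kernel_h * channels, conv_w * conv_h, num_kernels, conv_w * conv_h};
}
```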
- TensorInfo info_gemm(shape_gemm, 1, output_data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _gemm_output_3d.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - else - { - TensorInfo out_info{ *output->info() }; - out_info.set_data_type(output_data_type).set_data_layout(input->info()->data_layout()).set_is_resizable(true); - _gemm_output.allocator()->init(out_info); - _gemm_output_3d.allocator()->init(out_info); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output_3d; - } - - // Configure GEMM - // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; - configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - // Configure col2im - _col2im_kernel = std::make_unique<NECol2ImKernel>(); - _col2im_kernel->configure(gemm_output_to_use, output, Size2D(conv_w, conv_h)); - } - else - { - // Configure reshape layer - _reshape_layer.configure(gemm_output_to_use, output); - } - } - else - { - // Configure reshape layer - _reshape_layer.configure(gemm_output_to_use, output); - } - - if(_is_quantized && !_skip_col2im) - { - _tmp_output.allocator()->allocate(); - } - - _gemm_output.allocator()->allocate(); - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported"); - - const DataLayout data_layout = input->data_layout(); - const DataType data_type = input->data_type(); - const int idx_width = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - - TensorInfo im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo tmp_info{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - - const bool append_bias = false; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_bf16 = data_type == DataType::BFLOAT16; - bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - bool skip_col2im = false; - if(data_layout == DataLayout::NHWC) - { - skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!skip_col2im) - { - skip_im2col = false; - } - } - - if(skip_col2im) - { - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col))) - { - skip_im2col = false; - skip_col2im = false; - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else if(is_bf16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); - - // Output tensor auto inizialization if not yet initialized - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type); - weights_reshaped_info.set_quantization_info(weights->quantization_info()); - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - // Create tensor info for im2col reshaped inputs - // For CPU, the batch size is on the fourth dimension - TensorShape shape_im2col = input->tensor_shape(); - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - shape_im2col.set(2, 1); - - im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); - 
im2col_reshaped_info.set_quantization_info(input->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!skip_col2im) - { - TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - info_gemm = TensorInfo(shape_gemm, 1, output_data_type); - } - else - { - info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type); - } - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col)); - - // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) - { - ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h))); - } - - return Status{}; + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - bool out_has_padding = _skip_col2im && (_original_output->info()->padding().bottom != 0 || _original_output->info()->padding().top != 0); - - if(!_skip_im2col) - { - // Run input reshaping - unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - NEScheduler::get().schedule(_im2col_kernel.get(), y_dim); - } - - // Handle the case where output has top/bottom padding - const ITensor *out_to_use = out_has_padding ? &_gemm_output : _original_output; - _gemm_output_3d.info()->extend_padding(out_to_use->info()->padding()); - _gemm_output_3d.allocator()->import_memory(out_to_use->buffer()); - - // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - NEScheduler::get().schedule(_col2im_kernel.get(), Window::DimY); - } - else - { - _reshape_layer.run(); - } - } - else if(out_has_padding) - { - _reshape_layer.run(); - } - - _gemm_output_3d.allocator()->free(); + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - _weights_manager->run(_original_weights, &_reshape_weights_managed); - } - else - { - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? 
_mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); - } + _impl->op->prepare(_impl->run_pack); - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index cc0f20e695..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,604 +23,109 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +using namespace arm_compute::experimental; namespace arm_compute { -namespace +struct NEGEMMLowpMatrixMultiplyCore::Impl { -cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) + const ITensor *b{nullptr}; + std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<Tensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - - return asm_info; + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } -} // namespace - -using namespace arm_compute::misc::shape_calculator; - NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; 
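The rewrite of NEGEMMLowpMatrixMultiplyCore below follows the same shape as NEGEMMConvolutionLayer above: the kernel-level members are replaced by an Impl that owns a cpu:: operator plus its tensor packs and workspace, with prepare() guarded by a flag. A minimal sketch of that recurring structure, with illustrative names only (the real Impl additionally carries ITensorPacks, a MemoryGroup, and workspace tensors):

```cpp
// Minimal sketch of the pImpl-plus-operator structure these runtime functions
// now share. "CpuOperator" is a stand-in for the cpu:: operators; all names
// here are illustrative, not the actual ACL classes.
#include <memory>

struct CpuOperator
{
    void prepare() {} // one-off work, e.g. packing/reshaping constant weights
    void run() {}     // per-inference work
};

class RuntimeFunction
{
public:
    RuntimeFunction() : _impl(std::make_unique<Impl>()) {}

    void configure() { _impl->op = std::make_unique<CpuOperator>(); }

    void run()
    {
        prepare(); // lazy: the first run() pays the preparation cost
        _impl->op->run();
    }

    void prepare()
    {
        if (!_impl->is_prepared)
        {
            _impl->op->prepare();
            _impl->is_prepared = true; // later run() calls skip straight to op->run()
        }
    }

private:
    struct Impl
    {
        std::unique_ptr<CpuOperator> op{nullptr};
        bool is_prepared{false};
    };
    std::unique_ptr<Impl> _impl;
};
```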
-NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), - _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), - _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), - _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), - _run_activation(false), _flip_signedness(false) -{ -} - -void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) +void NEGEMMLowpMatrixMultiplyCore::configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - const ITensor *matrix_a = a; - const ITensor *matrix_b = b; - GEMMInfo info = gemm_info; - - // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _original_b = b; - - const ITensor *a_to_use = a; - - // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); - - _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); - _memory_group.manage(&_signed_a); - _convert_to_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>(); - _convert_to_signed_asymm->configure(a_to_use, &_signed_a); - a_to_use = &_signed_a; - _a_offset = _signed_a.info()->quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); - _memory_group.manage(&_signed_output); - _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a 
- matrix_a = &_signed_a; - } - - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); - TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); - } - - // Initialize assembly kernel meta-data - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); -#ifdef __aarch64__ - switch(a->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: - { - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - auto c_info_to_use = c == nullptr ? nullptr : c->info(); - _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); - _fused_assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output); - } - else - { - auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output); - _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); - } - _assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } -#endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) - { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - _mtx_a_reshape_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>(); - _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); - - // Configure transpose kernel - _mtx_b_reshape_kernel = std::make_unique<NEGEMMTranspose1xWKernel>(); - _mtx_b_reshape_kernel->configure(b, &_tmp_b); - } - - if(!_fused_assembly_path) - { - // Build reduction info - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel = std::make_unique<NEGEMMLowpMatrixBReductionKernel>(); - _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix 
A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); - } - - if(_fuse_output_stage) - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); - _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); - } - - _offset_contribution_output_stage_kernel = std::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); - - if(_flip_signedness) - { - _convert_from_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>(); - _convert_from_signed_asymm->configure(&_signed_output, output); - } - } - else - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); - _mm_kernel->configure(matrix_a, matrix_b, output); - } - // Configure offset contribution kernel - _offset_contribution_kernel = std::make_unique<NEGEMMLowpOffsetContributionKernel>(); - _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); - } - } - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) - { - _activation_func.configure(output, nullptr, activation); - } - - // Allocate tensors - if(!_assembly_path && !_run_vector_matrix_multiplication) - { - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(!_fused_assembly_path) - { - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); - } - } - if(_fuse_output_stage) - { - _mm_result_s32.allocator()->allocate(); - } - - if(_flip_signedness) - { - _signed_a.allocator()->allocate(); - _signed_output.allocator()->allocate(); - } + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) + { + b_info_to_use->set_are_values_constant(false); + } + + _impl->b = b; + _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>(); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? 
c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + // Make the B matrix dynamic values. 
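The cloned TensorInfo seen in configure() above, and repeated in validate() below, is worth spelling out: when reshape_b_only_on_first_run() is false, B cannot be assumed constant, so the clone is flagged as holding non-constant values before it reaches the operator; this prevents a cached one-off reshape of B from going stale if the caller rewrites B between runs. A reduced sketch of that logic as a hypothetical helper (the patch itself inlines it):

```cpp
// Hypothetical helper, not part of this patch: the clone leaves the caller's
// ITensorInfo untouched, and set_are_values_constant(false) tells the operator
// that B may change between runs, so no one-off reshaped copy of B is cached.
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"

#include <memory>

std::unique_ptr<arm_compute::ITensorInfo> make_b_info_to_use(const arm_compute::ITensorInfo *b,
                                                             const arm_compute::GEMMInfo    &gemm_info)
{
    auto b_info_to_use = b->clone();
    if (!gemm_info.reshape_b_only_on_first_run())
    {
        b_info_to_use->set_are_values_constant(false);
    }
    return b_info_to_use;
}
```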
+ auto b_info_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + b_info_to_use->set_are_values_constant(false); } - // Convert QASYMM8->QASYMM8_SIGNED - TensorInfo signed_a{}; - TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); - a_to_use = &signed_a; - a_offset = signed_a.quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a_info = &signed_a; - } - - // Initialize assembly kernel meta-data - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(info); - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); - run_optimised_requantized = run_optimised; - } - else - { - run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); - } - - if(run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) - { - if(info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if(!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); - } - - if(fuse_output_stage) - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); - } - } - - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } - - return Status{}; + return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info); } void NEGEMMLowpMatrixMultiplyCore::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) - { - NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY); - } - - // Run GEMM - if(_asm_glue->is_configured()) - { - _asm_glue->run(_asm_glue_tensors); - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - } - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } - - if(!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); - } - - if(_fuse_output_stage) - { - // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY); - } - else - { - // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY); - } - } - - // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) - { - NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY); - } - - // Run fused activation unless already run in the fused assembly - if(_run_activation) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - // Run assembly reshape - if(_asm_glue->is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); - _asm_glue->prepare(_asm_glue_tensors); - 
if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } - // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (has_reshape != std::end(_impl->aux_mem_req)) { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); + _impl->b->mark_as_unused(); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 807785a534..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,162 +25,54 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -namespace arm_compute -{ -NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default; - -void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default; - -void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - auto k = 
std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" -Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +namespace arm_compute { - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default; - -void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) +struct NEGEMMLowpOutputStage::Impl { - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + ITensorPack run_pack{}; + std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr}; +}; + +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); } - NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), info)); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>(); + _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); + + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::UNKNOWN, "NEGEMMLowpQuantizeDownScaleByFixedPoint cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); +} - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } +void NEGEMMLowpOutputStage::run() +{ + _impl->op->run(_impl->run_pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index 86cbfd187a..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEGatherKernel.h" #include <utility> @@ -31,6 +32,7 @@ namespace arm_compute { void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique<NEGatherKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 931fdb22f7..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -25,10 +25,12 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -67,41 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default; -void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, +void NEGenerateProposalsLayer::configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + 
const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); _compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>(); - _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -115,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -129,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -152,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); 
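Note: fixing rois_qinfo at QuantizationInfo(0.125f, 0) gives the QASYMM16 box coordinates a resolution of 1/8 pixel over the range [0, 8191.875]. The affine arithmetic behind that choice, as a scalar sketch rather than ACL's quantization helpers:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // value = scale * (q - offset); here scale = 0.125f and offset = 0.
    uint16_t quantize_qasymm16(float v, float scale = 0.125f, int32_t offset = 0)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<uint16_t>(std::clamp(q, 0, 65535));
    }

    float dequantize_qasymm16(uint16_t q, float scale = 0.125f, int32_t offset = 0)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }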
_quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -172,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -185,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -203,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -216,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -227,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -309,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -328,7 +373,7 @@ void NEGenerateProposalsLayer::run() NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -337,7 +382,7 @@ void NEGenerateProposalsLayer::run() _flatten_deltas.run(); 
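Note: on the QASYMM8 path validated above, the box transform itself runs in F32, sandwiched between a dequantize and a requantize stage. What NEBoundingBoxTransform computes is, in spirit, the standard box decoding used by region-proposal networks; a scalar sketch (clipping to the image and the BoundingBoxTransformInfo scale handling are omitted):

    #include <cmath>

    struct Box
    {
        float x1, y1, x2, y2;
    };

    // Apply one flattened delta (dx, dy, dw, dh) to one anchor box.
    Box decode_box(const Box &anchor, float dx, float dy, float dw, float dh)
    {
        const float w  = anchor.x2 - anchor.x1;
        const float h  = anchor.y2 - anchor.y1;
        const float cx = anchor.x1 + 0.5f * w + dx * w; // shift the centre
        const float cy = anchor.y1 + 0.5f * h + dy * h;
        const float nw = w * std::exp(dw); // rescale width/height
        const float nh = h * std::exp(dh);
        return {cx - 0.5f * nw, cy - 0.5f * nh, cx + 0.5f * nw, cy + 0.5f * nh};
    }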
_flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors.run(); _dequantize_deltas.run(); @@ -346,7 +391,7 @@ void NEGenerateProposalsLayer::run() // Build the boxes _bounding_box.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals.run(); } diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp index 5965b9722f..78218cbdee 100644 --- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" namespace arm_compute @@ -33,21 +35,29 @@ namespace arm_compute NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); + const DataLayout data_layout = input->info()->data_layout(); - const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }; + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; _normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>(); - if(!_is_nchw) + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -69,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), - InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -81,7 +92,7 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } @@ -89,7 +100,7 @@ void NEInstanceNormalizationLayer::run() NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { 
_permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index 505ee0a962..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -43,6 +45,8 @@ NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_ma void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Manage intermediate buffers _memory_group.manage(&_sumsq); @@ -66,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index d338e4fd2d..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. 
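Note: NEL2NormalizeLayer above validates as a SUM_SQUARE reduction along the wrapped axis followed by the normalization kernel. Assuming the usual out = x / max(sqrt(sum_sq), epsilon) formulation (NEL2NormalizeLayerKernel is the authoritative definition and may place epsilon differently), a scalar sketch over one reduction slice:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> l2_normalize(const std::vector<float> &x, float epsilon)
    {
        float sumsq = 0.0f;
        for (float v : x)
        {
            sumsq += v * v; // the SUM_SQUARE reduction
        }
        const float norm = std::max(std::sqrt(sumsq), epsilon);

        std::vector<float> out(x.size());
        for (std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = x[i] / norm;
        }
        return out;
    }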
* * SPDX-License-Identifier: MIT * @@ -24,20 +24,13 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" + +#include "src/common/utils/Log.h" namespace arm_compute { @@ -47,35 +40,122 @@ using namespace arm_compute::utils::info_helpers; NELSTMLayer::~NELSTMLayer() = default; NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + 
_activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams<ITensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - 
input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -84,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -117,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -139,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -162,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { 
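Note: the CIFG branch entered here never computes an independent input gate. It initialises an all-ones tensor (_ones) and uses the saturating subtraction (_subtract_input_gate), so that input_gate = 1 - forget_gate elementwise; as a one-line scalar sketch:

    // What the CIFG subtraction realises per element (ones - forget_gate).
    inline float cifg_input_gate(float forget_gate)
    {
        return 1.0f - forget_gate;
    }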
_memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -184,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -202,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -229,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -238,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -282,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -305,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -336,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, 
lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -359,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input, // Vector for holding the tensors to store in scratch buffer std::vector<const ITensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -373,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types 
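Note: both clips in this layer use LU_BOUNDED_RELU, which ACL defines as f(x) = min(a, max(b, x)); the first extra activation argument is therefore the upper bound and the second the lower one. That is why the cell clip now passes (cell_threshold, -cell_threshold), both where it is configured above and where it is re-validated below. A reference sketch:

    #include <algorithm>

    // min(a, max(b, x)): a is the upper bound, b the lower bound.
    float lu_bounded_relu(float x, float a, float b)
    {
        return std::min(a, std::max(b, x));
    }

    // lu_bounded_relu(x, t, -t) clamps x into [-t, t].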
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -414,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -435,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -446,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -466,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> 
inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -500,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); 
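Note: every layer-normalised gate in this validate() repeats the same four-step pipeline: NEMeanStdDevNormalizationLayer, an elementwise multiply by the per-gate layer-norm weights, the bias addition, and finally the logistic activation. A scalar sketch of one gate; the 1e-8 epsilon is an assumption for the sketch, not ACL's value:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> layer_norm_gate(std::vector<float>        x,
                                       const std::vector<float> &gamma, // layer-norm weights
                                       const std::vector<float> &bias)
    {
        const std::size_t n = x.size();

        float mean = 0.0f;
        for (float v : x)
        {
            mean += v;
        }
        mean /= static_cast<float>(n);

        float var = 0.0f;
        for (float v : x)
        {
            var += (v - mean) * (v - mean);
        }
        var /= static_cast<float>(n);

        const float inv_std = 1.0f / std::sqrt(var + 1e-8f); // epsilon assumed
        for (std::size_t i = 0; i < n; ++i)
        {
            const float z = (x[i] - mean) * inv_std * gamma[i] + bias[i];
            x[i]          = 1.0f / (1.0f + std::exp(-z)); // logistic activation
        }
        return x;
    }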
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + 
TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if 
(projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -591,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector<const ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -612,12 +775,12 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -625,15 +788,17 @@ void NELSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<half *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<float *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } _subtract_input_gate.run(); } @@ -641,13 +806,13 @@ void NELSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -660,29 +825,30 @@ void NELSTMLayer::run() _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); _accum_cell_gate_bias.run(); } + _activation_cell_state.run(); _pixelwise_mul_cell_state1.run(); _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -693,10 +859,10 @@ void NELSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -710,10 +876,10 @@ void NELSTMLayer::run() void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git 
a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index 5c0f19a15c..41f9c3d700 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -24,17 +24,10 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include <cmath> @@ -54,32 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + 
_recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), 
recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -87,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -104,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); 
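// Illustrative note on the packing performed by the init/concat steps around this
// point (sizes are the locals computed above; nothing new is assumed): the four
// per-gate weight matrices are fused so that a single GEMMLowp run produces every
// gate pre-activation at once:
//
//     _input_weights     : [input_size,               4 * output_size]  (DimY concat)
//     _recurrent_weights : [output_size,              4 * output_size]  (DimY concat)
//     _weights           : [output_size + input_size, 4 * output_size]  (DimX concat)
//
// The matching activation input is the [output_size + input_size, batch_size]
// concatenation of output_state_in and input built just below, and the resulting
// S32 [4 * output_size, batch_size] GEMM output is sliced back into per-gate
// [output_size, batch_size] blocks by the _slice_*_tensor functions later in
// configure().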
_concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector<const ITensor *> input_vector{ input, output_state_in }; + std::vector<const ITensor *> input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, &_input, Window::DimX); // Bias concatenation - std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -141,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -152,69 +227,91 @@ void NELSTMLayerQuantized::configure(const ITensor *input, quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(&_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 
3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -224,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - 
_output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -244,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -264,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - 
TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -308,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector<const ITensorInfo *> weights_vector; @@ -318,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -344,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -355,78 +494,107 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&output_highp, 
&bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -435,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -501,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp index 171d84da19..0013a521d1 100644 --- a/src/runtime/NEON/functions/NELogical.cpp +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -25,21 +25,22 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NELogicalKernel.h" namespace arm_compute { struct LogicalArgs { - std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr }; + std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr}; ITensorPack pack{}; }; struct NELogicalAnd::Impl : public LogicalArgs { }; -NELogicalAnd::NELogicalAnd() - : _impl(std::make_unique<Impl>()) +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>()) { } NELogicalAnd::~NELogicalAnd() = default; @@ -47,6 +48,7 @@ NELogicalAnd::~NELogicalAnd() = default; void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And); @@ -70,8 +72,7 @@ void NELogicalAnd::run() struct NELogicalOr::Impl : public LogicalArgs { }; -NELogicalOr::NELogicalOr() - : _impl(std::make_unique<Impl>()) +NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>()) { } NELogicalOr::~NELogicalOr() = default; @@ -79,6 +80,7 @@ NELogicalOr::~NELogicalOr() = default; void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, 
output); _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or); @@ -102,8 +104,7 @@ void NELogicalOr::run() struct NELogicalNot::Impl : public LogicalArgs { }; -NELogicalNot::NELogicalNot() - : _impl(std::make_unique<Impl>()) +NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>()) { } NELogicalNot::~NELogicalNot() = default; @@ -111,6 +112,7 @@ NELogicalNot::~NELogicalNot() = default; void NELogicalNot::configure(const ITensor *input, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); _impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not); diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp new file mode 100644 index 0000000000..31898bafc4 --- /dev/null +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuMatMul.h" + +namespace arm_compute +{ +struct NEMatMul::Impl +{ + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr<cpu::CpuMatMul> op{nullptr}; + MemoryGroup memory_group{}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; +}; + +NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +NEMatMul::~NEMatMul() = default; + +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + _impl->lhs = lhs; + _impl->rhs = rhs; + _impl->output = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); + _impl->op = std::make_unique<cpu::CpuMatMul>(); + _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); +} + +void NEMatMul::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp index 656777d726..c3861afd2c 100644 --- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,36 +25,66 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" -#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" +#include "src/cpu/operators/CpuMaxUnpooling.h" namespace arm_compute { +struct NEMaxUnpoolingLayer::Impl +{ + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr}; +}; + NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; -NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() - : _fill_func(), _unpooling_layer_kernel() +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() { } -void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); + const PixelValue zero_value(0.f); - _fill_func = std::make_unique<NEFill>(); - _unpooling_layer_kernel = std::make_unique<NEMaxUnpoolingLayerKernel>(); + _fill_func = std::make_unique<NEFill>(); + _impl = std::make_unique<Impl>(); + _impl->src = input; + _impl->indices = indices; + _impl->dst = output; + + _impl->op = std::make_unique<cpu::CpuMaxUnpooling>(); _fill_func->configure(output, zero_value); - _unpooling_layer_kernel->configure(input, indices, output, pool_info); + _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); } -Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { - return NEMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); + return Status{}; } void NEMaxUnpoolingLayer::run() { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->indices); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _fill_func->run(); - NEScheduler::get().schedule(_unpooling_layer_kernel.get(), Window::DimY); + _impl->op->run(pack); } } /* namespace arm_compute */ diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index 02de983b77..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
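The NEMaxUnpoolingLayer port above replaces the NEON kernel with cpu::CpuMaxUnpooling plus an ITensorPack built in run(). The operator's semantics are a plain scatter; a scalar reference sketch (illustrative helper, not an ACL API):

#include <cstddef>
#include <vector>

// Reference semantics of max unpooling: the destination is zero-filled
// (the NEFill step in the patch), then each source value is scattered to
// the linear offset recorded by the preceding max-pooling pass.
std::vector<float> max_unpool_ref(const std::vector<float>       &src,
                                  const std::vector<std::size_t> &indices,
                                  std::size_t                     dst_size)
{
    std::vector<float> dst(dst_size, 0.0f); // _fill_func->configure(output, zero_value)
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        dst[indices[i]] = src[i];
    }
    return dst;
}
// e.g. src = {6, 8}, indices = {1, 2}, dst_size = 4  ->  {0, 6, 8, 0}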
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute @@ -31,6 +32,8 @@ NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default; void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); + auto k = std::make_unique<NEMeanStdDevNormalizationKernel>(); k->configure(input, output, epsilon); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index 9dcb157c03..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" namespace arm_compute @@ -43,6 +45,7 @@ NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memor void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type()); _input_squared.allocator()->init(tensor_info); @@ -59,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -76,4 +82,4 @@ void NENormalizationLayer::run() _multiply_f.run(); NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } -}
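NEMeanStdDevNormalizationLayer above still only instantiates its kernel (the change adds parameter logging), so its contract is easy to state: each row is normalised to zero mean and unit variance. A scalar reference, assuming the header's default epsilon of 1e-8f:

#include <cmath>
#include <cstddef>
#include <vector>

// out = (x - mean(x)) / sqrt(var(x) + epsilon), computed per row.
std::vector<float> mean_stddev_norm_ref(const std::vector<float> &x, float epsilon = 1e-8f)
{
    float mean = 0.f;
    for (float v : x)
        mean += v;
    mean /= static_cast<float>(x.size());

    float var = 0.f;
    for (float v : x)
        var += (v - mean) * (v - mean);
    var /= static_cast<float>(x.size());

    const float        inv_stddev = 1.f / std::sqrt(var + epsilon);
    std::vector<float> out(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        out[i] = (x[i] - mean) * inv_stddev;
    return out;
}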
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index a05b545e9a..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" -#include "src/runtime/cpu/operators/CpuPRelu.h" + +#include "src/cpu/operators/CpuPRelu.h" namespace arm_compute { @@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu; struct NEPReluLayer::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; }; -NEPReluLayer::NEPReluLayer() - : _impl(std::make_unique<Impl>()) +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>()) { } -NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; NEPReluLayer::~NEPReluLayer() = default; diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 531b06de64..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -23,12 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -37,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -51,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding) NEPadLayer::~NEPadLayer() = default; NEPadLayer::NEPadLayer() - : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { _pad_kernel = std::make_unique<NEPadLayerKernel>(); _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); @@ -84,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // 
Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -123,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Reflect the input values for the padding before and after the input. std::vector<const ITensor *> concat_vector; - if(_padding[i].first > 0) + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -137,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -152,8 +166,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? 
output : &_concat_results[i]; + out->info()->set_quantization_info(output->info()->quantization_info()); + for (auto &v : concat_vector) + { + v->info()->set_quantization_info(input->info()->quantization_info()); + } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -164,22 +183,28 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. _num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -203,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -224,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -249,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -261,15 +290,15 @@ void NEPadLayer::run() case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - 
if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index f707fad757..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -24,19 +24,19 @@ #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuPermute.h" + +#include "src/cpu/operators/CpuPermute.h" namespace arm_compute { struct NEPermute::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuPermute> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPermute> op{nullptr}; }; -NEPermute::NEPermute() - : _impl(std::make_unique<Impl>()) +NEPermute::NEPermute() : _impl(std::make_unique<Impl>()) { } diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index 3a2f1984b4..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" -#include "src/runtime/cpu/operators/CpuMul.h" + +#include "src/cpu/operators/CpuMul.h" #include <utility> @@ -32,32 +33,42 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuMul> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMul> op{nullptr}; }; -NEPixelWiseMultiplication::NEPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique<cpu::CpuMul>(); - _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, 
rounding_policy, act_info); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); } void NEPixelWiseMultiplication::run() @@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuComplexMul> op{ nullptr }; + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuComplexMul> op{nullptr}; }; -NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp new file mode 100644 index 0000000000..e017e8c21d --- /dev/null +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
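Stepping back to the NEPadLayer hunks above: the REFLECT/SYMMETRIC path is built from reversed strided slices concatenated around the input, one dimension at a time. The two modes differ only in whether the border sample itself is mirrored; a one-dimensional reference (illustrative helper, not an ACL API):

#include <cstddef>
#include <vector>

// For src = {a, b, c, d} padded by (2, 1) in one dimension:
//   REFLECT   -> {c, b, a, b, c, d, c}   (mirror excludes the edge sample)
//   SYMMETRIC -> {b, a, a, b, c, d, d}   (mirror includes the edge sample)
std::vector<int> pad_1d_ref(const std::vector<int> &src, std::size_t before, std::size_t after, bool reflect)
{
    const std::size_t n   = src.size();
    const std::size_t off = reflect ? 1 : 0; // REFLECT skips the edge sample
    std::vector<int>  dst;
    for (std::size_t i = before; i > 0; --i)
        dst.push_back(src[i - 1 + off]);
    dst.insert(dst.end(), src.begin(), src.end());
    for (std::size_t i = 0; i < after; ++i)
        dst.push_back(src[n - 1 - i - off]);
    return dst;
}

This also explains the validate() checks above: REFLECT needs each padding amount strictly smaller than the corresponding input dimension, since the mirror cannot reuse the edge sample.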
+ */ +#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuPool3d.h" + +namespace arm_compute +{ +struct NEPooling3dLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPool3d> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; + +NEPooling3dLayer::~NEPooling3dLayer() = default; + +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuPool3d>(); + _impl->op->configure(input->info(), output->info(), pool_info); + + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return cpu::CpuPool3d::validate(input, output, pool_info); +} + +void NEPooling3dLayer::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 8d267a32c0..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,17 +26,18 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/cpu/operators/CpuPool2d.h" +#include "src/cpu/operators/CpuPool2d.h" namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - std::unique_ptr<cpu::CpuPool2d> op{ nullptr }; + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr<cpu::CpuPool2d> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData<Tensor> workspace_tensors{}; @@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl NEPoolingLayer::~NEPoolingLayer() = default; -NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _impl(std::make_unique<Impl>()) +NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->op = std::make_unique<cpu::CpuPool2d>(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? 
indices->info() : nullptr); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return cpu::CpuPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index 0c71706586..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,21 +27,31 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + auto k = std::make_unique<NEPriorBoxLayerKernel>(); k->configure(input1, input2, output, info); _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 85d62ac058..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. 
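For the new NEPooling3dLayer above, a minimal usage sketch (not part of the patch). The layer targets NDHWC, and the Pooling3dLayerInfo field names below are assumptions taken from Types.h; double-check them against the headers:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NDHWC: TensorShape lists the innermost dimension first -> (C, W, H, D, N).
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 4U, 4U, 1U), 1, DataType::F32, DataLayout::NDHWC));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 2U, 2U, 2U, 1U), 1, DataType::F32, DataLayout::NDHWC));

    Pooling3dLayerInfo info{}; // assumed public fields, see Types.h
    info.pool_type = PoolingType::MAX;
    info.pool_size = Size3D(2, 2, 2);
    info.stride    = Size3D(2, 2, 2);

    NEPooling3dLayer pool3d;
    ARM_COMPUTE_ERROR_THROW_ON(NEPooling3dLayer::validate(src.info(), dst.info(), info));
    pool3d.configure(&src, &dst, info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    pool3d.run();
    return 0;
}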
* * SPDX-License-Identifier: MIT * @@ -23,34 +23,38 @@ */ #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" namespace arm_compute { using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -59,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. 
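validate_mm() above leans on quantization::calculate_quantized_multiplier(), which turns the real-valued rescale factor of each matmul into an integer multiplier/shift pair. A sketch of the decomposition it performs, assuming the gemmlowp Q0.31 convention ACL follows (corner cases such as s <= 0 are handled in AsymmHelpers.cpp and omitted here):

#include <cmath>
#include <cstdint>

// Split s (> 0) into m and shift such that s ~= (m / 2^31) * 2^shift,
// with m a Q0.31 value in [2^30, 2^31).
void quantize_multiplier_ref(double s, std::int32_t &m, int &shift)
{
    int          exp = 0;
    const double q   = std::frexp(s, &exp); // s = q * 2^exp, q in [0.5, 1)
    auto         q31 = static_cast<std::int64_t>(std::llround(q * static_cast<double>(1ll << 31)));
    if (q31 == (1ll << 31)) // rounding carried up to 1.0 * 2^31
    {
        q31 /= 2;
        ++exp;
    }
    m     = static_cast<std::int32_t>(q31);
    shift = exp;
}
// e.g. s = 0.00315 -> q ~= 0.8064 (m ~= 0.8064 * 2^31), shift = -8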
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } @@ -92,6 +93,8 @@ Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const IT void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) { ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info())); + ARM_COMPUTE_LOG_PARAMS(src, dst); + _src = &src; _dst = &dst; _row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x()); @@ -100,39 +103,108 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } NEQLSTMLayer::~NEQLSTMLayer() = default; NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(), - _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(), - _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(), - _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(), - _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(), - _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(), - _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(), - _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(), - _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(), - _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(), - _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(), + : _memory_group(), + _dequantize_input_to_forget_weights(), + _quantize_input_to_forget_weights(), + _transpose_input_to_forget_weights(), + 
_transpose_input_to_cell_weights(), + _transpose_input_to_output_weights(), + _transpose_input_to_input_weights(), + _transpose_recurrent_to_forget_weights(), + _transpose_recurrent_to_cell_weights(), + _transpose_recurrent_to_output_weights(), + _transpose_recurrent_to_input_weights(), + _transpose_projection_weights(), + _input_to_input_reduction(), + _recurrent_to_input_reduction(), + _input_to_forget_reduction(), + _recurrent_to_forget_reduction(), + _input_to_cell_reduction(), + _recurrent_to_cell_reduction(), + _input_to_output_reduction(), + _recurrent_to_output_reduction(), + _projection_reduction(), + _projection_bias_add(), + _mm_input_to_forget(), + _mm_recurrent_to_forget(), + _pixelwise_mul_cell_to_forget(), + _input_to_forget_outstage(), + _recurrent_to_forget_outstage(), + _cell_to_forget_outstage(), + _accumulate_input_recurrent_forget(), + _accumulate_cell_forget(), + _forget_gate_sigmoid(), + _mm_input_to_cell(), + _input_to_cell_outstage(), + _mm_recurrent_to_cell(), + _recurrent_to_cell_outstage(), + _accumulate_input_recurrent_modulation(), + _cell_gate_tanh(), + _input_gate_sub(), + _mm_input_to_input(), + _input_to_input_outstage(), + _mm_recurrent_to_input(), + _recurrent_to_input_outstage(), + _accumulate_input_recurrent_input(), + _pixelwise_mul_cell_to_input(), + _cell_to_input_outstage(), + _accumulate_cell_input(), + _input_gate_sigmoid(), + _pixelwise_mul_forget_cell(), + _pixelwise_mul_input_cell(), + _add_forget_cell(), + _cell_clip(), + _mm_input_to_output(), + _input_to_output_outstage(), + _mm_recurrent_to_output(), + _recurrent_to_output_outstage(), + _accumulate_input_recurrent_output(), + _pixelwise_mul_cell_to_output(), + _cell_to_output_outstage(), + _accumulate_cell_to_output(), + _output_gate_sigmoid(), + _hidden_tanh(), + _pixelwise_mul_hidden(), + _hidden_outstage(), + _mm_projection(), + _projection_outstage(), + _accumulate_projection(), + _projection_clip(), + _projection_bias_copy(), + _projection_output_to_accumulate_copy(), + _projection_accumulate_to_output_copy(), + _hidden_to_output_copy(), + _layer_norms(), + _copy_output(), + _layer_norm_weights(), + _layer_norm_bias(), _layer_norm_output() { _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -144,33 +216,88 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); 
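Each configure_mm() call below feeds the output stage a rescale factor of the form scale(weights) * scale(input) / scale(intermediate). A worked number, with purely illustrative scales:

#include <cstdio>

int main()
{
    const float w_scale            = 0.004f;          // weights quantization scale (illustrative)
    const float x_scale            = 0.02f;           // input quantization scale (illustrative)
    const float intermediate_scale = 0.000244140625f; // forget_intermediate_scale() = 2^-12, say

    // This is the gemmlowp_scale handed to calculate_quantized_multiplier().
    const float s = w_scale * x_scale / intermediate_scale;
    std::printf("effective rescale = %f\n", s); // 0.32768
    return 0;
}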
mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams<ITensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - // Validate - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); + _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + { + _convert_input_to_forget_weights_to_qsymm8 = true; + // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 + + _input_to_forget_weights_f32.allocator()->init( + 
TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); + // Setup the quantize output tensor to go from F32 -> QSYMM8 + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + + _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); + _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); + + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } + else + { + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -181,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -191,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -213,44 +342,59 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
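The reduction kernels configured below exist because, with QSYMM8 weights (zero offset) and an asymmetric input, the input-offset term of the quantized dot product does not depend on the input values and can be folded into a bias once at configure time. A scalar reference of what the matrix-A reduction with scalar -qinput.offset contributes:

#include <cstddef>
#include <cstdint>
#include <vector>

// y_i = sum_k (x_k - x_off) * w_ik = sum_k x_k * w_ik  +  (-x_off * sum_k w_ik)
// The second term is the "effective bias" precomputed here.
std::vector<std::int32_t> effective_bias_ref(const std::vector<std::int8_t> &w, // num_units x depth, row-major
                                             std::size_t num_units, std::size_t depth,
                                             std::int32_t x_offset)
{
    std::vector<std::int32_t> bias(num_units, 0);
    for (std::size_t i = 0; i < num_units; ++i)
    {
        std::int32_t row_sum = 0;
        for (std::size_t k = 0; k < depth; ++k)
            row_sum += w[i * depth + k];
        bias[i] = -x_offset * row_sum; // matches GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)
    }
    return bias;
}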
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _input_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _recurrent_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _input_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - - _recurrent_to_cell_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + + _input_to_forget_reduction->configure(input_to_forget_weights->info(), 
_input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>(); - _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); } } @@ -258,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input, _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { 
_transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -279,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, 
lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -321,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
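sigmoid_tanh_outqinfo above pins every gate activation to QSYMM16 with scale 1/32768: with a scale of 2^-15 the int16 range maps to [-1.0, 0.99997], which covers the codomain of sigmoid and tanh at maximum 16-bit resolution. Dequantization is then a single division by 2^15; a trivial reference:

#include <cstdint>

// QSYMM16 with QuantizationInfo(1.f / 32768.f, 0): value = q * 2^-15.
float dequantize_qsymm16_ref(std::int16_t q)
{
    return static_cast<float>(q) / 32768.f;
}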
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -358,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
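When CIFG is enabled, the input gate configured below is not computed from weights at all: it is derived from the forget gate as 1 - f, via the saturating subtraction from the _ones tensor. A sketch of that coupling in the QSYMM16 gate domain (scale 1/32768, so 1.0 saturates to 32767; this mirrors the intent, not the library's kernel):

    #include <algorithm>
    #include <cstdint>

    int16_t cifg_input_gate(int16_t forget_gate)
    {
        const int32_t one_q = 32767; // ~1.0 in QSYMM16 with scale 1/32768
        const int32_t diff  = one_q - static_cast<int32_t>(forget_gate);
        return static_cast<int16_t>(std::clamp<int32_t>(diff, -32768, 32767)); // ConvertPolicy::SATURATE
    }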
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -373,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
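The cell hunks that follow compute c_t = f * c_(t-1) + i * g with saturating quantized arithmetic, optionally clipped. A float-domain reference of the same dataflow (the real code additionally tracks the 2^15 and cell_shift scale bookkeeping):

    #include <algorithm>

    float cell_update_reference(float f, float c_prev, float i, float g, float cell_clip)
    {
        float c = f * c_prev + i * g;     // pixelwise muls + _add_forget_cell
        if (cell_clip > 0.0f)             // _cell_clip via LU_BOUNDED_RELU
            c = std::clamp(c, -cell_clip, cell_clip);
        return c;
    }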
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
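The quantized_cell_clip applied just above is the float clip value mapped into the QSYMM16 cell-state domain. A standalone equivalent of that quantization under the usual symmetric 16-bit convention (round to nearest, saturate; the library helper may differ in rounding-mode details):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int16_t quantize_qsymm16_sketch(float value, float scale)
    {
        const long q = std::lround(value / scale);
        return static_cast<int16_t>(std::clamp<long>(q, -32768, 32767));
    }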
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - 
_cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -480,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
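The hidden_state_scale expression in the hunks below, pow(2, -15) / hidden_state_scale() * pow(2, -15), is (2^-15)^2 divided by the requested hidden scale: both tanh(c_t) and sigmoid(o_t) are QSYMM16 tensors with scale 2^-15, so their S32 product carries scale 2^-30, and the out-stage must convert from that to the hidden-state quantization. Made explicit:

    #include <cmath>

    float hidden_rescale(float hidden_state_scale)
    {
        const float product_scale = std::pow(2.0f, -15) * std::pow(2.0f, -15); // tanh * sigmoid, both QSYMM16
        return product_scale / hidden_state_scale;                             // equals the expression above
    }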
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -502,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -513,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
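In the projection hunks below the out-stage factor is projection_scale = w_scale * hidden_state_scale / output_state_scale: the matmul accumulator carries the product of the projection-weight scale and the hidden-state scale, and the result must land on the output-state quantization. As a one-line sketch:

    float projection_rescale(float projection_weight_scale, float hidden_state_scale, float output_state_scale)
    {
        return projection_weight_scale * hidden_state_scale / output_state_scale; // S32 -> QASYMM8_SIGNED
    }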
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -542,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input, accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + 
-quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -576,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -598,14 +812,28 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, 
input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias); @@ -623,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -650,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
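The branched mismatch check added above encodes a simple rule: either every gate weight tensor shares one data type, or input_to_forget_weights alone is QSYMM8 and is exempted from the pairwise comparison (QSYMM8 can never equal QASYMM8_SIGNED, so including it would always fail). A sketch of the predicate, with a local enum since this is illustration rather than the library's macro machinery:

    enum class DataType { QASYMM8_SIGNED, QSYMM8 };

    bool gate_weight_types_ok(DataType input_to_forget, DataType other_weights)
    {
        // QSYMM8 input_to_forget_weights are allowed to differ; the remaining
        // weights must still agree among themselves.
        return input_to_forget == DataType::QSYMM8 || input_to_forget == other_weights;
    }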
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -658,49 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + 
input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); - // Validate weights transpose - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - 
if(!lstm_params.has_cifg_opt()) + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -713,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
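The CpuGemmLowpMatrixAReductionKernel validations above correspond to the effective-bias trick noted in the source: with an asymmetric input, sum_k (a_k - a_off) * w_k = sum_k a_k * w_k - a_off * sum_k w_k, so the -offset-scaled row sums of each weight matrix can be folded into the bias once, ahead of time. A direct sketch (row-major weights of shape num_units x depth, a hypothetical layout for illustration):

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> effective_bias_sketch(const std::vector<int8_t> &weights,
                                               int num_units, int depth,
                                               int32_t offset, // qinput.offset or qoutput_state_in.offset
                                               const std::vector<int32_t> &bias)
    {
        std::vector<int32_t> eff(num_units);
        for (int u = 0; u < num_units; ++u)
        {
            int32_t row_sum = 0;
            for (int k = 0; k < depth; ++k)
                row_sum += weights[u * depth + k];
            eff[u] = bias[u] - offset * row_sum; // the kernel is passed -offset as its scalar
        }
        return eff;
    }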
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = 
std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -743,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -766,85 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / 
lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { 
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
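Every gate validated in this function follows the same pipeline: two rescaled matmuls (input and recurrent paths), a saturating accumulate, optional peephole and layer-norm contributions, then a sigmoid (tanh for the modulation gate). A condensed float-domain view, eliding the quantized out-stages:

    #include <cmath>

    float gate_reference(float input_contrib, float recurrent_contrib, bool modulation_gate)
    {
        const float pre = input_contrib + recurrent_contrib;     // NEArithmeticAddition, SATURATE
        return modulation_gate ? std::tanh(pre)                  // cell/modulation gate
                               : 1.0f / (1.0f + std::exp(-pre)); // forget/input/output gates
    }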
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -852,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
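The hidden-state rescale above composes three scales: the tanh and logistic gate outputs are produced at a fixed QSYMM16 scale (2^-15 each, hence the two pow(2, -15) factors), and dividing by hidden_state_scale() re-targets their S32 product at the hidden tensor. calculate_quantized_multiplier then splits that real scale into the integer multiplier/shift pair consumed by NEGEMMLowpOutputStage. A standalone sketch of the usual decomposition, not the library's exact implementation (shift sign conventions vary between implementations):

#include <cmath>
#include <cstdint>

// Sketch: split a positive real scale into a Q0.31 multiplier and a
// power-of-two exponent so that scale ≈ multiplier * 2^(shift - 31).
inline void decompose_scale(float scale, int32_t &multiplier, int32_t &shift)
{
    int         exp      = 0;
    const float mantissa = std::frexp(scale, &exp); // scale = mantissa * 2^exp, mantissa in [0.5, 1)
    int64_t     q        = static_cast<int64_t>(std::llround(static_cast<double>(mantissa) * (1LL << 31)));
    if (q == (1LL << 31)) // rounding can push the mantissa up to exactly 1.0
    {
        q >>= 1;
        ++exp;
    }
    multiplier = static_cast<int32_t>(q); // Q0.31 value in [2^30, 2^31)
    shift      = exp;
}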
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -955,14 +1319,14 @@ void NEQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } @@ -977,7 +1341,7 @@ void NEQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } @@ -985,7 +1349,7 @@ void NEQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -997,14 +1361,14 @@ void NEQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } @@ -1017,7 +1381,7 @@ void NEQLSTMLayer::run() _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1028,14 +1392,14 @@ void NEQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } @@ -1048,31 +1412,31 @@ void NEQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
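One note before the projection hunks: the prepare() changes further below replace the stateful NEScheduler::get().schedule(kernel, dim) calls with the operator-style schedule_op API. The reduction kernels no longer capture tensor pointers at configure time, so every dispatch passes an explicit ITensorPack binding sources and destinations, as the packIF/packRF/... initializers in the diff show. A minimal sketch of the pattern, with illustrative tensor names:

// Sketch: operator-style dispatch. The kernel was configured on ITensorInfo
// only; concrete tensors are bound at run time through the pack.
ITensorPack pack = {{TensorType::ACL_SRC, weights},        // tensor to reduce (illustrative)
                    {TensorType::ACL_DST, &eff_bias}};     // effective-bias destination (illustrative)
NEScheduler::get().schedule_op(reduction_kernel.get(),     // stateless kernel
                               Window::DimY,               // dimension to split work across threads
                               reduction_kernel->window(), // full execution window
                               pack);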
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1084,8 +1448,16 @@ void NEQLSTMLayer::run() void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { + if (_convert_input_to_forget_weights_to_qsymm8) + { + _input_to_forget_weights_f32.allocator()->allocate(); + _input_to_forget_weights_symm8.allocator()->allocate(); + _dequantize_input_to_forget_weights.run(); + _quantize_input_to_forget_weights.run(); + } + // Pre-transpose weights to be used in GEMM. _input_to_forget_weights_transposed.allocator()->allocate(); _input_to_cell_weights_transposed.allocator()->allocate(); @@ -1101,16 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY); + + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); + + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1125,18 +1506,45 @@ void NEQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY); - NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY); - - if(_has_projection) + + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + 
{TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY); - if(_projection_bias != nullptr) + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1146,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index e607917615..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" -#include "src/runtime/cpu/operators/CpuQuantize.h" + +#include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { struct NEQuantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuQuantize> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuQuantize> op{nullptr}; }; -NEQuantizationLayer::NEQuantizationLayer() - : _impl(std::make_unique<Impl>()) +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>()) { } NEQuantizationLayer::~NEQuantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index d59f7da0dd..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -27,31 +27,37 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include 
"arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" + +#include "src/common/utils/Log.h" namespace arm_compute { NERNNLayer::~NERNNLayer() = default; NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -68,23 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), 
recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -132,7 +149,7 @@ void NERNNLayer::run() void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index a946358e18..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,20 +23,29 @@ */ #include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel auto k = std::make_unique<NEROIAlignLayerKernel>(); k->configure(input, rois, output, pool_info); diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index f9434059ea..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -22,26 +22,36 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" namespace arm_compute { NEROIPoolingLayer::~NEROIPoolingLayer() = default; -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() { } -Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + _roi_kernel = std::make_unique<NEROIPoolingLayerKernel>(); _roi_kernel->configure(input, rois, output, pool_info); } @@ -50,4 +60,4 @@ void NEROIPoolingLayer::run() { NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index 56ef2bf657..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,19 +24,21 @@ #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NERangeKernel.h" namespace arm_compute { NERange::~NERange() = default; -NERange::NERange() - : _kernel() +NERange::NERange() : _kernel() { } void NERange::configure(ITensor *output, const float start, const float end, const float step) { + ARM_COMPUTE_LOG_PARAMS(output, start, end, step); _kernel = std::make_unique<NERangeKernel>(); _kernel->configure(output, start, end, step); } @@ -50,4 +52,4 @@ void NERange::run() { NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index b50a925f44..a23db87059 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,25 @@ #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -48,29 +50,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
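The range comment above admits negative axes; convert_negative_axis later folds them into [0, rank) so that the subsequent sort and remove_dimension calls see canonical indices (axis -1 on a 4-D tensor becomes 3). A minimal standalone sketch of that wrap-around, mirroring but not reproducing the library helper:

// Sketch: map each axis from [-rank, rank) into [0, rank).
template <typename Coords>
Coords &wrap_negative_axes(Coords &axes, int rank)
{
    for (unsigned int i = 0; i < axes.num_dimensions(); ++i)
    {
        if (axes[i] < 0)
        {
            axes[i] += rank; // e.g. -1 + 4 == 3
        }
    }
    return axes;
}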
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -79,19 +88,11 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); const unsigned int remove_index = axis_local[i] - i; ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); - out_shape.remove_dimension(remove_index); + out_shape.remove_dimension(remove_index, false); } } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); - if(requant) - { - TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); - NEDequantizationLayer::validate(input, &input_no_quant); - TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32)); - NEQuantizationLayer::validate(&output_no_quant, output); - } } return Status{}; } @@ -100,25 +101,34 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax NEReduceMean::~NEReduceMean() = default; NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), - _output_no_quant() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); + // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto initialization if not yet initialized - const TensorShape
output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); @@ -126,18 +136,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, ITensor *tmp_input = input; ITensor *tmp_output = output; - if(_do_requant) - { - _memory_group.manage(&_input_no_quant); - _memory_group.manage(&_output_no_quant); - TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape); - output_no_quant_info.set_data_type(DataType::F32); - auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info); - auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32)); - _dequant.configure(input, &_input_no_quant); - tmp_input = &_input_no_quant; - tmp_output = &_output_no_quant; - } Coordinates axis_local = reduction_axis; const int input_dims = tmp_input->info()->num_dimensions(); @@ -145,70 +143,65 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } - // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) - { - _requant.configure(&_output_no_quant, output); - _input_no_quant.allocator()->allocate(); - _output_no_quant.allocator()->allocate(); - } } void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) - { - _dequant.run(); - } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) - { - _requant.run(); - } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 5d6f520a52..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,8 +26,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { @@ -41,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -58,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis) NEReductionOperation::~NEReductionOperation() = default; NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -73,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -83,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } @@ -101,28 +115,43 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _is_reshape_required = !keep_dims; auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); @@ -133,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); @@ -144,7 +173,7 @@ void NEReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp deleted file mode 100644 index d9fd987480..0000000000 --- a/src/runtime/NEON/functions/NERemap.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NERemap.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/NEON/kernels/NERemapKernel.h" - -#include <utility> - -namespace arm_compute -{ -void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); - - auto k = std::make_unique<NERemapKernel>(); - k->configure(input, map_x, map_y, output, policy, border_mode, constant_border_value); - _kernel = std::move(k); -} -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuQuantize.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 5af7f6343b..89cf575f38 100644 --- a/src/runtime/cpu/operators/CpuQuantize.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,38 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(__aarch64__) -#include "src/runtime/cpu/operators/CpuQuantize.h" +#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuQuantizeKernel.h" + +#include "src/core/NEON/kernels/NEReorderKernel.h" namespace arm_compute { -namespace cpu -{ -Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +NEReorderLayer::~NEReorderLayer() = default; + +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>()) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst)); - return Status{}; } -void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + auto k = std::make_unique<NEReorderKernel>(); + k->configure(input, output, input_wf, output_wf); + _reorder_kernel = std::move(k); +} - // Configure quantize kernel - auto k = std::make_unique<kernels::CpuQuantizeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); +void NEReorderLayer::run() +{ + // Run Reorder + NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -void CpuQuantize::run(ITensorPack &tensors) +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); + return NEReorderKernel::validate(input, output, input_wf, output_wf); } -} // namespace cpu + } // namespace arm_compute + +#endif // 
defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index 23ca3a4eea..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,15 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride) { + ARM_COMPUTE_LOG_PARAMS(input, output, stride); + auto k = std::make_unique<NEReorgLayerKernel>(); k->configure(input, output, stride); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index c0c78ea652..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuReshape.h" + +#include "src/cpu/operators/CpuReshape.h" #include <utility> @@ -32,16 +33,15 @@ namespace arm_compute { struct NEReshapeLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuReshape> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuReshape> op{nullptr}; }; -NEReshapeLayer::NEReshapeLayer() - : _impl(std::make_unique<Impl>()) +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>()) { } -NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; NEReshapeLayer::~NEReshapeLayer() = default; diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index 36127ef83c..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,19 +23,25 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { -void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis) +void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis); + auto k = std::make_unique<NEReverseKernel>(); - k->configure(input, output, axis); + k->configure(input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return NEReverseKernel::validate(input, output, axis); + return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 0fbad07d0f..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,32 +23,34 @@ */ #include "arm_compute/runtime/NEON/functions/NEScale.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" -#include "src/runtime/cpu/operators/CpuScale.h" -#include "support/Rounding.h" +#include "src/cpu/operators/CpuScale.h" namespace arm_compute { struct NEScale::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - std::unique_ptr<cpu::CpuScale> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr<cpu::CpuScale> op{nullptr}; }; -NEScale::NEScale() - : _impl(std::make_unique<Impl>()) +NEScale::NEScale() : _impl(std::make_unique<Impl>()) { } NEScale::~NEScale() = default; void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info) { + ARM_COMPUTE_LOG_PARAMS(input, output, info); + _impl->src = input; _impl->dst = output; _impl->op = std::make_unique<cpu::CpuScale>(); @@ -56,50 +58,71 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? 
input->info()->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape TensorShape shape(output->info()->dimension(idx_width)); shape.set(1, output->info()->dimension(idx_height), false); - const TensorInfo tensor_info_dxdy(shape, Format::F32); - const TensorInfo tensor_info_offsets(shape, Format::S32); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - _impl->dx.allocator()->init(tensor_info_dxdy); - _impl->dy.allocator()->init(tensor_info_dxdy); - _impl->offsets.allocator()->init(tensor_info_offsets); - switch(policy_to_use) + if (precompute_indices_weights) { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - // Allocate once the configure methods have been called - _impl->offsets.allocator()->allocate(); - break; - } - case InterpolationPolicy::BILINEAR: + const TensorInfo tensor_info_dxdy(shape, Format::F32); + const TensorInfo tensor_info_offsets(shape, Format::S32); + + _impl->dx.allocator()->init(tensor_info_dxdy); + _impl->dy.allocator()->init(tensor_info_dxdy); + _impl->offsets.allocator()->init(tensor_info_offsets); + switch (policy_to_use) { - // Allocate once the configure methods have been called - _impl->dx.allocator()->allocate(); - _impl->dy.allocator()->allocate(); - _impl->offsets.allocator()->allocate(); - break; + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + // Allocate once the configure methods have been called + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::BILINEAR: + { + // Allocate once the configure methods have been called + _impl->dx.allocator()->allocate(); + _impl->dy.allocator()->allocate(); + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::AREA: + { + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - case InterpolationPolicy::AREA: + } + else + { + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) { - break; - } - default: ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } } } diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index f8ba9f03ed..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,12 +24,16 @@ #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESelectKernel.h" namespace arm_compute { void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); + auto k = std::make_unique<NESelectKernel>(); k->configure(c, x, y, output); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 9b08bca38a..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -45,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -64,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct NESlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NESlice> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NESlice> op{nullptr}; }; -NESlice::NESlice() - : _impl(std::make_unique<Impl>()) +NESlice::NESlice() : _impl(std::make_unique<Impl>()) { } -NESlice::NESlice(NESlice &&) = default; +NESlice::NESlice(NESlice &&) = default; NESlice &NESlice::operator=(NESlice &&) = default; NESlice::~NESlice() = default; -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::NESlice::validate(input, output, starts, ends); } diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index bee692c08b..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,26 +22,26 @@ * SOFTWARE. 
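[Editor's note] For context on the NESlice validation tightened above, a hedged usage sketch; the shapes and coordinates here are illustrative and not part of the commit, and leaving the output info empty assumes validation infers the sliced shape:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
using namespace arm_compute;

// Slice columns 2..6 and rows 1..3 of an 8x4 F32 tensor (hypothetical shape).
TensorInfo src_info(TensorShape(8U, 4U), 1, DataType::F32);
TensorInfo dst_info{}; // left empty; the sliced shape is inferred
// Negative starts are rejected by the std::any_of check above; ends of -1
// are folded into the end mask by construct_slice_end_mask().
const Status st = NESlice::validate(&src_info, &dst_info, Coordinates(2, 1), Coordinates(6, 3));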
*/ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/cpu/kernels/CpuSoftmaxKernel.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/runtime/cpu/operators/CpuSoftmax.h" +#include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute { template <bool IS_LOG> struct NESoftmaxLayerGeneric<IS_LOG>::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor max{ nullptr }; - std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr }; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData<Tensor> workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; }; template <bool IS_LOG> @@ -53,9 +53,9 @@ NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryMana template <bool IS_LOG> NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; -template <bool IS_LOG> +template <bool IS_LOG> NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default; -template <bool IS_LOG> +template <bool IS_LOG> NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default; template <bool IS_LOG> @@ -65,23 +65,24 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } template <bool IS_LOG> -void NESoftmaxLayerGeneric<IS_LOG>::run() +void NESoftmaxLayerGeneric<IS_LOG>::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index e8a84246fe..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -28,24 +28,29 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include 
"src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" namespace arm_compute { NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _fill_f(), _has_padding(false) +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique<NEFill>(); @@ -55,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique<NEFill>(); @@ -69,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_ _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -87,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { _fill_f->run(); } diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp 
b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index 1e3776c448..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,23 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" namespace arm_compute { NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + _space_to_depth_kernel = std::make_unique<NESpaceToDepthLayerKernel>(); _space_to_depth_kernel->configure(input, output, block_shape); } diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index db19bbb824..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index af5c80d036..2f88ffca2a 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
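[Editor's note] Stepping back to the NESoftmaxLayer hunk a little further up: the compile-time template cpu::CpuSoftmaxGeneric<IS_LOG> is replaced by a single cpu::CpuSoftmaxGeneric whose log behaviour is selected at run time. The call-site difference, annotated:

// Before this commit: log-softmax vs. softmax was baked into the type.
//   auto op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
//   op->configure(input->info(), output->info(), beta, axis);

// After: one operator type; IS_LOG travels as an ordinary argument, and
// validate() mirrors the same extended signature.
auto op = std::make_unique<cpu::CpuSoftmaxGeneric>();
op->configure(input->info(), output->info(), beta, axis, IS_LOG);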
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStackLayerKernel.h" namespace arm_compute @@ -37,25 +39,18 @@ namespace arm_compute NEStackLayer::~NEStackLayer() = default; NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false) { } void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output) { - _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + ARM_COMPUTE_LOG_PARAMS(input, axis, output); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) - { - _stack_kernels[i] = std::make_unique<NEStackLayerKernel>(); - _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output); - } + _stack_kernel->configure(input, axis_u, output); } Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output) @@ -67,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const size_t rank = input[0]->num_dimensions(); const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1)); - const unsigned int num_inputs = input.size(); - - for(unsigned int i = 0; i < num_inputs; i++) - { - // All the tensors must have the same rank - ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); - // Validate Kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output)); - } + // Validate Kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output)); return Status{}; } void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + if (!_is_prepared) { - NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY); + _stack_kernel->prepare(); + _is_prepared = true; } + + NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension()); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index fffb38c3ca..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
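[Editor's note] The NEStackLayer rewrite just above collapses the per-input kernel vector into a single kernel that consumes the whole input vector, and defers one-off setup to the first run. The scheduling change, with explanatory comments added:

// Before: N kernels, one per stacked input, each scheduled along DimY.
//   for(unsigned i = 0; i < _num_inputs; i++)
//       NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);

// After: a single kernel; prepare() runs once, and the kernel reports
// which dimension the scheduler should split work over.
if (!_is_prepared)
{
    _stack_kernel->prepare();
    _is_prepared = true;
}
NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension());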
* * SPDX-License-Identifier: MIT * @@ -25,24 +25,38 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto k = std::make_unique<NEStridedSliceKernel>(); k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -50,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct NEStridedSlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEStridedSlice> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NEStridedSlice> op{nullptr}; }; -NEStridedSlice::NEStridedSlice() - : _impl(std::make_unique<Impl>()) +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>()) { } -NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; NEStridedSlice::~NEStridedSlice() = default; -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { _impl->src = input; _impl->dst = output; @@ -81,10 +99,16 @@ void NEStridedSlice::run() _impl->op->run(pack); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return 
experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 088816eb95..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,15 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" +#include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples) { + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); + auto k = std::make_unique<NETileKernel>(); k->configure(input, output, multiples); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 3b3023f3b3..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -24,19 +24,20 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/core/Validate.h" -#include "src/runtime/cpu/operators/CpuTranspose.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuTranspose.h" namespace arm_compute { struct NETranspose::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<cpu::CpuTranspose> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuTranspose> op{nullptr}; }; -NETranspose::NETranspose() - : _impl(std::make_unique<Impl>()) +NETranspose::NETranspose() : _impl(std::make_unique<Impl>()) { } @@ -45,6 +46,7 @@ NETranspose::~NETranspose() = default; void NETranspose::configure(const ITensor *input, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; _impl->dst = output; diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 50596dbc0a..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,22 +58,23 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis) { std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); // Wrap around negative values const unsigned int axis_u = wrap_axis(axis, input->info()); @@ -79,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -100,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 0bf1738bec..7334be8456 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. 
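[Editor's note] NEUnstack, whose hunk closes just above, implements unstacking as one strided slice per output: slice i fixes the unstack axis to i, the end mask leaves every dimension unbounded, and the shrink mask removes the sliced axis from the output rank. A sketch with hypothetical names (input, outputs, slices, num_slices) standing in for the members used in the hunk:

Coordinates slice_start;                 // all zeros, rank set as above
const int32_t shrink_axis_mask = 1 << axis_u;
for (unsigned int slice = 0; slice < num_slices; ++slice)
{
    slice_start.set(axis_u, slice);      // select the slice-th plane
    // Empty ends/strides plus slice_end_mask mean "to the end, step 1"
    // in every dimension; shrink_axis_mask drops axis_u from the result.
    slices[slice].configure(input, outputs[slice], slice_start, Coordinates(), BiStrides(), 0,
                            slice_end_mask, shrink_axis_mask);
}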
* * SPDX-License-Identifier: MIT * @@ -24,759 +24,93 @@ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -namespace -{ -inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - if(input->data_type() == DataType::F32) - { - if(input_dims.width > 4 && input_dims.height > 4) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(input->data_type() == DataType::F16) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo 
*output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} +using namespace arm_compute::experimental; -inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +struct NEWinogradConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} -inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo 
*input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input) -{ - const DataLayout data_layout = input->info()->data_layout(); - const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = input->info()->dimension(3); - - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_UNUSED(output); - 
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - return INEWinogradLayerTransformWeightsKernel::validate(input, weights); -} - -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type) -{ - Size2D output_tile = Size2D{}; - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); - if(data_type == DataType::F16) - { - output_tile = Size2D(4U, 4U); - } - } - else if(kernel_dims == Size2D(5U, 5U)) - { - output_tile = Size2D(2U, 2U); - } - else if(kernel_dims == Size2D(1U, 3U)) - { - output_tile = Size2D(1U, 6U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = Size2D(6U, 1U); - } - else if(kernel_dims == Size2D(1U, 5U)) - { - output_tile = Size2D(1U, 4U); - } - else if(kernel_dims == Size2D(5U, 1U)) - { - output_tile = Size2D(4U, 1U); - } - else if(kernel_dims == Size2D(7U, 1U)) - { - output_tile = Size2D(2U, 1U); - } - else if(kernel_dims == Size2D(1U, 7U)) - { - output_tile = Size2D(1U, 2U); - } - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - const std::vector<WinogradConfiguration> fast_math_winograd_f16 = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)) - }; - - const std::vector<WinogradConfiguration> fast_math_winograd_f32 = - { - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - switch(data_type) - { - case DataType::F16: - return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end(); - case DataType::F32: - return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end(); - default: - return false; - } -} - -inline bool fuse_function_supported(const ActivationLayerInfo &act_info) -{ - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; -} - -arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info) -{ - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b()); - } - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b()); - } - default: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::None); - } - } -} -} //namespace + MemoryGroup memory_group{}; + std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + 
experimental::MemoryRequirements aux_mem_req{}; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool is_activationlayer_enabled{false}; + DataLayout data_layout{}; +}; NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager) - : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), - _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), - _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false), _data_layout() + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info)); - - // Get indices for the width and height - _data_layout = input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx)); - const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx)); - const DataType data_type = input->info()->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - _weights = weights; - _input = input; - _output = output; - _is_prepared = false; - - int n_gemms = 1; - int N_BLOCK = 1; // Size of block used by GEMM. 
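[Editor's note] From this point the hunk deletes the hand-rolled transform kernels, permutes and scratch tensors; the replacement configure() and run() (visible further down in this hunk) forward everything to cpu::CpuWinogradConv2d through the Impl introduced above. The new run path, with comments added:

void NEWinogradConvolutionLayer::run()
{
    prepare();                                               // one-off weight transform via prep_pack
    MemoryGroupResourceScope scope_mg(_impl->memory_group);  // acquire workspace tensors
    _impl->op->run(_impl->run_pack);                         // operator executes all stages
}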
- - std::unique_ptr<INEWinogradLayerTransformInputKernel> transform_input_kernel; - std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel; - std::unique_ptr<INEWinogradLayerTransformOutputKernel> transform_output_kernel; - - if(data_type == DataType::F32) - { - if(kernel_size == Size2D(3, 3)) - { - if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - } - else if(kernel_size == Size2D(5, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 3)) - { - using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(3, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(5, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 7)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 
1, 7, 1>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(7, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(data_type == DataType::F16) - { - if(kernel_size == Size2D(3, 3)) - { - using config = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>; - transform_input_kernel = std::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = std::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - - const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID; - const bool use_same_padding = use_padding_type == PADDING_SAME; - - // Get convolved dimensions - const int in_channels = input->info()->dimension(channel_idx); - const int out_channels = output->info()->dimension(channel_idx); - - const Tensor4DShape in_shape(internal_get_input_shape(input)); - const size_t data_type_size = input->info()->element_size(); - // Get the memory required to instantiate a new Winograd operator. 
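[Editor's note] The deleted configure() above picks a NEWinogradLayerConfiguration<out_h, out_w, k_h, k_w> per kernel size. By standard Winograd arithmetic (an inference, not stated in this diff), an m-by-m output tile with an r-by-r kernel uses an input tile of edge m + r - 1, and N_GEMMS equals the number of points in that tile:

// One batched GEMM per transformed-domain point.
constexpr int winograd_n_gemms(int m, int r) { return (m + r - 1) * (m + r - 1); }
static_assert(winograd_n_gemms(4, 3) == 36, "F(4x4, 3x3): 6x6 input tile");
static_assert(winograd_n_gemms(2, 3) == 16, "F(2x2, 3x3): 4x4 input tile");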
- constexpr size_t storage_alignment = 64; - - // Kernel Storage - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) - * data_type_size; - - // Input storage - const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) - * data_type_size; - - // Output storage - const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size; - const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels); - const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels); - const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - - // Configure GEMM - const int tile_rows = iceildiv(output_shape.first, output_tile.height); - const int tile_cols = iceildiv(output_shape.second, output_tile.width); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK); - const int output_matrix_row_stride = kernel_matrix_row_stride; - - TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(data_type_size); - a_strides.set(1, a_strides[0] * k); - //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. - a_strides.set(2, 0); - a_strides.set(3, data_type_size * input_matrix_stride); - - TensorShape b_shape(n, k, n_gemms); - Strides b_strides(data_type_size); - b_strides.set(1, data_type_size * kernel_matrix_row_stride); - b_strides.set(2, data_type_size * kernel_matrix_stride); - - TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(data_type_size); - d_strides.set(1, data_type_size * output_matrix_row_stride); - //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
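[Editor's note] The stride bookkeeping in the removed code maps the transformed tensors onto that batched GEMM. Pulling the dimension mapping out of the lines above for readability (names exactly as in the removed code):

// Each of the n_gemms multiplies an (m x k) by a (k x n) matrix:
//   m = batches x ceil(out_rows / tile_h) x ceil(out_cols / tile_w)
//   k = input channels, n = output channels
const int tile_rows = iceildiv(output_shape.first, output_tile.height);
const int tile_cols = iceildiv(output_shape.second, output_tile.width);
const int m = in_shape.n_batches * tile_rows * tile_cols;
const int k = in_shape.n_channels;
const int n = out_channels;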
- d_strides.set(2, 0); - d_strides.set(3, data_type_size * output_matrix_stride); - - TensorInfo a_info{}; - TensorInfo b_info{}; - TensorInfo d_info{}; - a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); - - _input_transformed.allocator()->init(a_info, storage_alignment); - _kernel_storage.allocator()->init(b_info, storage_alignment); - _output_transformed.allocator()->init(d_info, storage_alignment); - - // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0), - _output->info()->dimension(1), _output->info()->dimension(3)), - 1, _output->info()->data_type()); - _output_nhwc.allocator()->init(info); - - const ITensor *input_to_use = _input; - ITensor *output_to_use = _output; - PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); - const unsigned int max_num_threads = NEScheduler::get().num_threads(); - - // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_input_nhwc); - _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - input_to_use = &_input_nhwc; - weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); - } - - // Configure input transform kernel - _memory_group.manage(&_input_transformed); - _memory_group.manage(&_input_workspace); - transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); - TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type()); - _input_workspace.allocator()->init(input_workspace_info); - _input_workspace.allocator()->allocate(); - if(_data_layout == DataLayout::NCHW) - { - _input_nhwc.allocator()->allocate(); - } - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector); - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - // Configure GEMM function - _memory_group.manage(&_output_transformed); - _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f); - _input_transformed.allocator()->allocate(); - - // Configure output transform function - // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method - if(_data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_output_nhwc); - output_to_use = &_output_nhwc; - } - const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info); - - transform_output_kernel->configure(biases, - &_output_transformed, - output_matrix_stride, - output_to_use, - in_shape.n_batches, - output_shape.first, - output_shape.second, - out_channels, - &_output_workspace, - activation); +NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default; - const size_t 
output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
-        TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
-        _output_workspace.allocator()->init(output_workspace_info);
-        _output_workspace.allocator()->allocate();
-        _output_transformed.allocator()->allocate();
-
-        // Reorder the convoluted output to ACL's ordering NCHW
-        if(_data_layout == DataLayout::NCHW)
-        {
-            _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-            _output_nhwc.allocator()->allocate();
-        }
-
-        _transform_input_kernel = std::move(transform_input_kernel);
-        _transform_weights_kernel = std::move(transform_weights_kernel);
-        _transform_output_kernel = std::move(transform_output_kernel);
+void NEWinogradConvolutionLayer::configure(const ITensor *input,
+                                           const ITensor *weights,
+                                           const ITensor *biases,
+                                           ITensor *output,
+                                           const PadStrideInfo &conv_info,
+                                           const ActivationLayerInfo &act_info,
+                                           bool enable_fast_math)
+{
+    _impl->original_weights = weights;
+    _impl->op = std::make_unique<cpu::CpuWinogradConv2d>();
+    _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+                         conv_info, act_info, enable_fast_math);
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(_output, nullptr, act_info);
-    }
+    _impl->aux_mem_req = _impl->op->workspace();
+    _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+    _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+    _impl->workspace =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
 void NEWinogradConvolutionLayer::run()
 {
     prepare();
 
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_data_layout == DataLayout::NCHW)
-    {
-        //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-        _permute_input.run();
-    }
-
-    // Transform input tensor to the winograd domain
-    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
-    //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    _gemm_function.run();
-
-    // Transform output tensor to the spatial domain
-    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
-    if(_data_layout == DataLayout::NCHW)
-    {
-        // Reorder the convoluted output to ACL's ordering NCHW
-        _permute_output.run();
-    }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    MemoryGroupResourceScope scope_mg(_impl->memory_group);
+    _impl->op->run(_impl->run_pack);
 }
 
-Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input,
+                                            const ITensorInfo *weights,
+                                            const ITensorInfo *biases,
+                                            const ITensorInfo *output,
+                                            const PadStrideInfo &conv_info,
+                                            const ActivationLayerInfo &act_info,
+                                            bool enable_fast_math)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
-
-    // Get indices for the width and height
-    const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
-    // Input shape, kernel size and output tile
-    const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
-    const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
-    const DataType data_type = input->data_type();
-    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
-    // Check if the Winograd configuration requires fast math
-    if(!enable_fast_math)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
-                                        "This Winograd configuration requires enable_fast_math=true");
-    }
-
-    const WinogradInfo winograd_info = WinogradInfo(output_tile,
-                                                    kernel_size,
-                                                    input_dims,
-                                                    conv_info,
-                                                    input->data_layout());
-
-    // Validate input transform
-    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-    const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
-    // Validate filter transform
-    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
-    const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
-    // Validate batched matrix multiply
-    TensorShape batched_mm_output_shape = input0.tensor_shape();
-    batched_mm_output_shape[0] = input1.tensor_shape()[0];
-    const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-
-    if(kernel_size == Size2D(3, 3))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
-        return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(5, 5))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
-        return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    if(kernel_size == Size2D(3, 1))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(1, 3))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(5, 1))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(1, 5))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(7, 1))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else if(kernel_size == Size2D(1, 7))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
-        return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
-    }
+    return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 }
 
 void NEWinogradConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_impl->is_prepared)
     {
-        // Permute weights
-        _weights_hwio.allocator()->allocate();
-        _permute_weights.run();
-        _weights->mark_as_unused();
-
-        // Transform weights
-        _kernel_storage.allocator()->allocate();
-        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-        _weights_hwio.allocator()->free();
+        _impl->op->prepare(_impl->prep_pack);
+        _impl->original_weights->mark_as_unused();
 
-        _gemm_function.prepare();
-        if(!_kernel_storage.is_used())
-        {
-            _kernel_storage.allocator()->free();
-        }
+        // Release temporary tensors that are only used in prepare stage
+        release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
 
-        _is_prepared = true;
+        _impl->is_prepared = true;
    }
 }
 } // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index ca763f907b..2a5abb5f7a 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,14 +27,29 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
+
 #include <omp.h>
 
 namespace arm_compute
 {
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+OMPScheduler::OMPScheduler() // NOLINT
+    : _num_threads(cpu_info().get_cpu_num_excluding_little()),
+      _has_lmb(cpu_info().cpu_has_little_mid_big()),
+      _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
+{
+}
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+          (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 OMPScheduler::OMPScheduler() // NOLINT
-    : _num_threads(omp_get_max_threads())
+    : _num_threads(omp_get_max_threads()),
+      _has_lmb(cpu_info().cpu_has_little_mid_big()),
+      _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
 {
 }
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+          (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 
 unsigned int OMPScheduler::num_threads() const
 {
@@ -44,7 +59,15 @@ unsigned int OMPScheduler::num_threads() const
 void OMPScheduler::set_num_threads(unsigned int num_threads)
 {
     const unsigned int num_cores = omp_get_max_threads();
-    _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+    const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads;
+    _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads;
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+          (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+    _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+          (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 }
 
 void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
@@ -63,20 +86,20 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win
     const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
     const unsigned int num_threads = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || num_threads == 1)
+    if (!kernel->is_parallelisable() || num_threads == 1)
     {
         ThreadInfo info;
-        info.cpu_info = &_cpu_info;
+        info.cpu_info = &cpu_info();
         kernel->run_op(tensors, max_window, info);
     }
     else
    {
         const unsigned int num_windows = num_threads;
         std::vector<IScheduler::Workload> workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; t++)
+        for (unsigned int t = 0; t < num_windows; t++)
         {
             //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
             {
                 Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                 win.validate();
@@ -89,20 +112,25 @@
 #ifndef DOXYGEN_SKIP_THIS
 void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
 {
-    const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
-    if(num_threads < 1)
+    const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size());
+    const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work);
+
+    if (num_threads_to_use < 1)
     {
         return;
     }
 
     ThreadInfo info;
-    info.cpu_info = &_cpu_info;
-    info.num_threads = num_threads;
-    #pragma omp parallel firstprivate(info) num_threads(num_threads)
+    info.cpu_info = &cpu_info();
+    info.num_threads = num_threads_to_use;
+#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+    schedule(static, 1)
+    for (unsigned int wid = 0; wid < amount_of_work; ++wid)
     {
-        const int tid = omp_get_thread_num();
+        const int tid  = omp_get_thread_num();
+        info.thread_id = tid;
 
-        workloads[tid](info);
+        workloads[wid](info);
     }
 }
 #endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index a47fa184fa..d746f618b5 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment)
     return (remainder != 0U) ? 
offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index ffedf5586c..8f3c1a84ba 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include <algorithm> - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -31,6 +29,8 @@ #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" +#include <algorithm> + namespace arm_compute { OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info) @@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp index a8ad53da90..19415b35cf 100644 --- a/src/runtime/OperatorTensor.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/OperatorTensor.h" + #include "arm_compute/runtime/MemoryRegion.h" #include "support/Cast.h" @@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info() uint8_t *OperatorTensor::buffer() const { - switch(_mem_type) + switch (_mem_type) { case MemoryType::CPU: return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer(); diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 87376a71a4..7fb9bd8000 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -31,8 +31,7 @@ using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -78,7 +75,7 @@ std::unique_ptr<IMemoryPool> PoolManager::release_pool() arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index d1dea066e7..1de8d2abdb 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) +RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 0713b9a2ad..3f1e96968a 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -76,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. 
Call set(std::shared_ptr<IScheduler> &scheduler) " + "before Scheduler::get()"); } else { @@ -106,13 +107,13 @@ IScheduler &Scheduler::get() } else { - if(_schedulers.empty()) + if (_schedulers.empty()) { _schedulers = init(); } auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index cc21d62630..4fb08d79f5 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr<IScheduler> SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp index 6f9a32c879..74ee539fec 100644 --- a/src/runtime/SchedulerUtils.cpp +++ b/src/runtime/SchedulerUtils.cpp @@ -47,35 +47,34 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std: double ratio = m / static_cast<double>(n); // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) + for (unsigned i = 0; i != adjusted; ++i) { //try down const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) + if (max_threads % adj_down == 0) { - return { adj_down, max_threads / adj_down }; + return {adj_down, max_threads / adj_down}; } //try up const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) + if (max_threads % adj_up == 0) { - return { adj_up, max_threads / adj_up }; + return {adj_up, max_threads / adj_up}; } } //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) + if (m > n) { - return { std::min<unsigned>(m, max_threads), 1 }; + return {std::min<unsigned>(m, max_threads), 1}; } else { - return { 1, std::min<unsigned>(n, max_threads) }; + return {1, std::min<unsigned>(n, max_threads)}; } } #endif /* #ifndef BARE_METAL */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index ae16c8be0a..f87256abb1 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 6dcef9f0b5..f17e323694 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 4ae27c59fc..372852bfea 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - 
if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -133,7 +134,7 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { _memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use)); } diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 15e9d43a49..a7f7b5f3cb 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -41,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map<Scheduler::Type, const std::string> scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } diff --git a/src/runtime/cpu/ICpuOperator.h b/src/runtime/cpu/ICpuOperator.h deleted file mode 100644 index 70ab4364c7..0000000000 --- a/src/runtime/cpu/ICpuOperator.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICPUOPERATOR_H -#define ARM_COMPUTE_ICPUOPERATOR_H - -#include "arm_compute/runtime/NEON/INEOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -using ICpuOperator = experimental::INEOperator; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICPUOPERATOR_H */ diff --git a/src/runtime/cpu/operators/CpuActivation.cpp b/src/runtime/cpu/operators/CpuActivation.cpp deleted file mode 100644 index 7753c9601f..0000000000 --- a/src/runtime/cpu/operators/CpuActivation.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include "src/core/cpu/kernels/CpuActivationKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info) -{ - auto k = std::make_unique<kernels::CpuActivationKernel>(); - k->configure(input, output, activation_info); - _kernel = std::move(k); -} - -Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) -{ - return kernels::CpuActivationKernel::validate(input, output, activation_info); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuActivation.h b/src/runtime/cpu/operators/CpuActivation.h deleted file mode 100644 index 0ae16bf958..0000000000 --- a/src/runtime/cpu/operators/CpuActivation.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_ACTIVATION_H -#define ARM_COMPUTE_CPU_ACTIVATION_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuActivationKernel */ -class CpuActivation : public ICpuOperator -{ -public: - /** Constructor */ - CpuActivation() = default; - /** Configure operator for a given list of arguments - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] output Destination tensor info. Data type supported: same as @p src - * @param[in] activation_info Activation layer parameters. - */ - void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuActivation - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p src - * @param[in] act_info Activation layer information. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */ diff --git a/src/runtime/cpu/operators/CpuAdd.cpp b/src/runtime/cpu/operators/CpuAdd.cpp deleted file mode 100644 index 23b09aca4f..0000000000 --- a/src/runtime/cpu/operators/CpuAdd.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuAdd.h" - -#include "src/core/cpu/kernels/CpuAddKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique<kernels::CpuAddKernel>(); - k->configure(src0, src1, dst, policy); - _kernel = std::move(k); -} - -Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuAddKernel::validate(src0, src1, dst, policy); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuAdd.h b/src/runtime/cpu/operators/CpuAdd.h deleted file mode 100644 index 8ae7833f01..0000000000 --- a/src/runtime/cpu/operators/CpuAdd.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ADD_H -#define ARM_COMPUTE_CPU_ADD_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuAddKernel */ -class CpuAdd : public ICpuOperator -{ -public: - /** Constructor */ - CpuAdd() = default; - /** Initialise the kernel's input, dst and border mode. - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] policy Overflow policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
- * - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuAdd - * - * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] policy Overflow policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ADD_H */ diff --git a/src/runtime/cpu/operators/CpuCast.h b/src/runtime/cpu/operators/CpuCast.h deleted file mode 100644 index 2aea2d2b09..0000000000 --- a/src/runtime/cpu/operators/CpuCast.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CAST_H -#define ARM_COMPUTE_CPU_CAST_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuCastKernel */ -class CpuCast : public ICpuOperator -{ -public: - /** Constructor */ - CpuCast() = default; - /** Configure operator for a given list of arguments - * - * Input data type must be different than output data type. - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src |dst | - * |:--------------|:-----------------------------------------------| - * |QASYMM8_SIGNED | S16, S32, F32, F16 | - * |QASYMM8 | U16, S16, S32, F32, F16 | - * |U8 | U16, S16, S32, F32, F16 | - * |U16 | U8, U32 | - * |S16 | QASYMM8_SIGNED, U8, S32 | - * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | - * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | - * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8| - * - * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[out] dst The destination tensor. 
Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[in] policy Conversion policy. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuCast::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */ diff --git a/src/runtime/cpu/operators/CpuConcatenate.cpp b/src/runtime/cpu/operators/CpuConcatenate.cpp deleted file mode 100644 index 23eb3fceab..0000000000 --- a/src/runtime/cpu/operators/CpuConcatenate.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuConcatenate.h" - -#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h" -#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h" -#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h" -#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuConcatenate::CpuConcatenate() - : _concat_kernels(), _num_srcs(0), _axis(0) -{ -} - -void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis) -{ - ARM_COMPUTE_ERROR_ON(dst == nullptr); - - _axis = axis; - _num_srcs = srcs_vector.size(); - - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis)); - - unsigned int offset = 0; - - for(unsigned int i = 0; i < _num_srcs; ++i) - { - switch(axis) - { - case Window::DimX: - { - auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>(); - kernel->configure(srcs_vector.at(i), offset, dst); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimY: - { - auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>(); - kernel->configure(srcs_vector.at(i), offset, dst); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimZ: - { - auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>(); - kernel->configure(srcs_vector.at(i), offset, dst); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 3: - { - auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>(); - kernel->configure(srcs_vector.at(i), offset, dst); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += srcs_vector.at(i)->dimension(axis); - } -} - -Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); - - unsigned int offset = 0; - for(const auto &src : srcs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - switch(axis) - { - case Window::DimX: - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst)); - break; - } - case Window::DimY: - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst)); - break; - } - case Window::DimZ: - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst)); - break; - } - case 3: - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += src->dimension(axis); - } - - if(dst->total_size() != 0) - { - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); - 
ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} - -void CpuConcatenate::run(ITensorPack &tensors) -{ - if(tensors.empty()) - { - ARM_COMPUTE_ERROR("No inputs provided"); - } - - if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) - { - ARM_COMPUTE_ERROR("Configured with different number of inputs"); - } - - int i = 0; - for(auto &k : _concat_kernels) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); - pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); - NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack); - ++i; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuConcatenate.h b/src/runtime/cpu/operators/CpuConcatenate.h deleted file mode 100644 index d2af3e2ad2..0000000000 --- a/src/runtime/cpu/operators/CpuConcatenate.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONCATENATE_H -#define ARM_COMPUTE_CPU_CONCATENATE_H - -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <vector> - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: - * - * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0). - * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1). - * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2). - * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3). - */ -class CpuConcatenate : public ICpuOperator -{ -public: - /** Constructor */ - CpuConcatenate(); - /** Configure operator for a given list of arguments - * - * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel, - * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel. - * - * @param[in,out] srcs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Output tensor. 
Data types supported: Same as @p srcs_vector. - * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. - */ - void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis); - /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer - * - * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. - * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel, - * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel. - * - * @param[in] srcs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Output tensor info. Data types supported: Same as @p srcs_vector. - * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. - * - * @return a status - */ - static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels; - unsigned int _num_srcs; - unsigned int _axis; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */ diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp deleted file mode 100644 index 3f2f4e95cf..0000000000 --- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) -{ - auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>(); - k->configure(src, dst, original_src_shape, data_layout); - _kernel = std::move(k); -} - -Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) -{ - return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); -} - -void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) -{ - NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); -} -} // namesapce cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h deleted file mode 100644 index 3f1ddf1dbe..0000000000 --- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H -#define ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */ -class CpuConvertFullyConnectedWeights : public ICpuOperator -{ -public: - /** Constructor */ - CpuConvertFullyConnectedWeights() = default; - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[out] dst Destintation tensor. Data types supported: Same as @p src - * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - /** Static function to check if given info will lead to a valid configuration of @ref CpuConvertFullyConnectedWeights - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[in] dst Destination tensor. Data types supported: Same as @p dst - * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H */ diff --git a/src/runtime/cpu/operators/CpuCopy.cpp b/src/runtime/cpu/operators/CpuCopy.cpp deleted file mode 100644 index 9fbe916163..0000000000 --- a/src/runtime/cpu/operators/CpuCopy.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuCopy.h" - -#include "src/core/cpu/kernels/CpuCopyKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuCopyKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuCopyKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp deleted file mode 100644 index 160a9fd70b..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); - - //Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); - } - return Status{}; -} -} // namespace - 
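The dilation bound that validate_arguments_optimized above enforces is worth reading concretely: a depthwise kernel of size k sampled with dilation d covers k + (k - 1) * (d - 1) input positions, and the deleted checks reject any configuration where that span exceeds the padded input extent in the same dimension. A minimal standalone sketch of that arithmetic (illustrative only, not arm_compute code; the helper names are made up for this note):

    #include <cstddef>

    // Effective extent of a k-tap kernel under dilation d: d - 1 gaps are
    // inserted between consecutive taps (hypothetical helper, not an ACL API).
    constexpr std::size_t dilated_extent(std::size_t k, std::size_t d)
    {
        return k + (k - 1) * (d - 1);
    }

    // Mirrors the condition rejected above: the dilated kernel must fit in
    // the input plus the padding on both sides of that dimension.
    constexpr bool fits(std::size_t k, std::size_t d, std::size_t dim, std::size_t pad_lo, std::size_t pad_hi)
    {
        return dilated_extent(k, d) <= dim + pad_lo + pad_hi;
    }

    static_assert(dilated_extent(3, 1) == 3, "no dilation: plain 3-tap kernel");
    static_assert(dilated_extent(3, 2) == 5, "dilation 2 spreads 3 taps over 5 inputs");
    static_assert(fits(3, 2, 4, 1, 1), "5 <= 4 + 1 + 1");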
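The common thread between the rewritten NEWinogradConvolutionLayer earlier in this diff and the CpuDepthwiseConv2d methods below is the pack-based operator interface: configure() works on ITensorInfo descriptors only, prepare() consumes an ITensorPack of constant inputs (weights, biases) for one-off transforms, and run() consumes a pack holding the per-call tensors keyed by the same ACL_* slot ids. A minimal caller-side sketch of that convention (not library code; the tensors src, weights, biases, dst are assumed to be already configured and allocated, and the include paths are my assumption):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Sketch of the calling convention only: Operator stands for any
    // pack-based cpu operator (e.g. the cpu::CpuWinogradConv2d configured above).
    template <typename Operator>
    void run_once(Operator &op, Tensor &src, Tensor &weights, Tensor &biases, Tensor &dst)
    {
        // Constant inputs only: consumed by prepare() for one-off weight
        // transforms, after which the original weights may be marked unused.
        ITensorPack prep_pack;
        prep_pack.add_const_tensor(TensorType::ACL_SRC_1, &weights);
        prep_pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);

        // Per-run tensors, using the same slot ids as the run_pack built in configure().
        ITensorPack run_pack;
        run_pack.add_const_tensor(TensorType::ACL_SRC_0, &src);
        run_pack.add_const_tensor(TensorType::ACL_SRC_1, &weights);
        run_pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);
        run_pack.add_tensor(TensorType::ACL_DST, &dst);

        op.prepare(prep_pack); // one-off preparation
        op.run(run_pack);      // per-inference execution
    }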
-CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal() - : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) -{ -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src, - const ITensorInfo *weights, - const ITensorInfo *biases, - ITensorInfo *dst, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); - - _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _has_bias = biases != nullptr; - _is_nchw = src->data_layout() == DataLayout::NCHW; - _permute = _is_nchw; - _is_prepared = false; - - // Configure pipeline - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); - - if(!_is_activationlayer_enabled) - { - act_info_to_use = info.act_info; - } - - _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>(); - if(_is_nchw) - { - _permute_input = std::make_unique<cpu::CpuPermute>(); - _permute_weights = std::make_unique<cpu::CpuPermute>(); - _permute_output = std::make_unique<cpu::CpuPermute>(); - - auto input_perm = std::make_unique<TensorInfo>(); - auto weights_perm = std::make_unique<TensorInfo>(); - auto output_perm = std::make_unique<TensorInfo>(); - - // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - - // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - - output_perm->set_data_layout(DataLayout::NHWC); - output_perm->set_quantization_info(dst->quantization_info()); - - // Configure optimized depthwise - _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); - - // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - output_perm->set_data_layout(DataLayout::NHWC); - _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); - } - else - { - _dwc_optimized_func->configure(src, weights, biases, dst, info); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique<cpu::CpuActivation>(); - _activationlayer_function->configure(dst, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *dst, - const ConvolutionInfo &info) -{ - return validate_arguments_optimized(src, weights, biases, dst, info); -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - 
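From this point run() (and prepare() further down) pulls every tensor it touches out of the ITensorPack: the operator itself holds no tensor state. A hedged call-site sketch follows; the tensor variables and the exact scratch slots are assumptions read off the accesses below:

// Hypothetical caller (ITensor pointers owned by the caller; scratch
// tensors sized from the operator's workspace() requirements):
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // input feature map
pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // depthwise weights
pack.add_const_tensor(TensorType::ACL_SRC_2, bias);    // optional bias, may be nullptr
pack.add_tensor(TensorType::ACL_DST_0, dst);           // output feature map
// ACL_INT_0..ACL_INT_4 carry the permuted input/weights/output, workspace
// and packed-weights scratch tensors looked up by slot below.
op.prepare(pack); // one-off: permutes and packs the constant weights
op.run(pack);     // per-call: permute input, run assembly kernel, permute back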
prepare(tensors); - - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute input - if(_permute) - { - ITensorPack pack; - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - } - - // Run assembly function - if(_is_nchw) - { - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src_perm); - pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst_perm); - _dwc_optimized_func->run(pack); - } - else - { - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src); - pack.add_tensor(TensorType::ACL_SRC_1, weights); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst); - _dwc_optimized_func->run(pack); - } - - // Permute output - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - // Run activation - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute weights - if(_permute) - { - auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, permuted_weights); - _permute_weights->run(pack); - - weights->mark_as_unused(); - - ITensorPack pack_opt; - pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - else - { - ITensorPack pack_opt; - pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - - _is_prepared = true; - } -} - -CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric() - : _depthwise_conv_kernel(nullptr), 
_permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false), - _is_activationlayer_enabled(false) -{ -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); - - _is_nchw = src->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; - - ITensorInfo *input_to_use = src; - const ITensorInfo *weights_to_use = weights; - ITensorInfo *output_to_use = dst; - - auto input_perm = std::make_unique<TensorInfo>(); - auto weights_perm = std::make_unique<TensorInfo>(); - auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - - if(_is_nchw) - { - _permute_input = std::make_unique<cpu::CpuPermute>(); - _permute_weights = std::make_unique<cpu::CpuPermute>(); - - _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - input_to_use = input_perm.get(); - - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - weights_to_use = weights_perm.get(); - - output_to_use = output_perm.get(); - } - - _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); - _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - - if(_is_nchw) - { - _permute_output = std::make_unique<cpu::CpuPermute>(); - _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); - output_perm->set_data_layout(DataLayout::NHWC); - } - - //Configure Activation Layer - _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique<cpu::CpuActivation>(); - _activationlayer_function->configure(dst, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - if(src->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = src->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - 
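The shape permutations above follow ACL's PermutationVector convention; here is a minimal model of the assumed semantics (destination dimension i receives source dimension perm[i], consistent with the documented example that [2, 0, 1] maps NCHW to NHWC):

#include <array>
#include <cstddef>

// Sketch of the assumed permute semantics: dst[i] = src[perm[i]].
template <std::size_t N>
std::array<std::size_t, N> permute_shape(const std::array<std::size_t, N> &src,
                                         const std::array<std::size_t, N> &perm)
{
    std::array<std::size_t, N> dst{};
    for (std::size_t i = 0; i < N; ++i)
    {
        dst[i] = src[perm[i]];
    }
    return dst;
}

// NCHW shapes are stored (W, H, C, N), so {2, 0, 1} yields (C, W, H, N),
// i.e. NHWC, and the inverse vector {1, 2, 0} used on the output undoes it:
//   permute_shape<4>({W, H, C, N}, {2, 0, 1, 3}) -> {C, W, H, N}
//   permute_shape<4>({C, W, H, N}, {1, 2, 0, 3}) -> {W, H, C, N}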
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); - } - - // Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); - } - - return Status{}; -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - - if(_is_nchw) - { - prepare(tensors); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - - ITensorPack pack_depth; - pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); - pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - else - { - ITensorPack pack_depth; - pack_depth.add_tensor(TensorType::ACL_SRC_0, src); - pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(!weights->is_used()); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, weights_perm); - - _permute_weights->run(pack); - weights->mark_as_unused(); - _is_prepared = true; - } -} - -CpuDepthwiseConv2d::CpuDepthwiseConv2d() - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() -{ -} - -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - _depth_conv_func = 
get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(src, weights, biases, dst, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(src, weights, biases, dst, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ConvolutionInfo &info) -{ - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CpuDepthwiseConv2d::run(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h deleted file mode 100644 index 049397fe60..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H -#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -/** Function to execute a depthwise convolution. - */ -class CpuDepthwiseConv2d : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConv2d(); - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - - /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] dst Destination tensor. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - -private: - /** Basic function to execute optimized depthwise convolution routines.
This function calls the following kernels: - * - * @note At the moment 3x3 and 5x5 convolutions with stride 1 or 2 are supported - * - * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present - * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required - * -# @ref NEActivationLayer if fused activation is required - * - */ - class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConv2dOptimizedInternal(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete; - /** Default move constructor */ - CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default; - /** Default destructor */ - ~CpuDepthwiseConv2dOptimizedInternal() = default; - /** Initialize the function's source, destination, kernels and border_size. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - }; - - /** Basic function to execute a generic depthwise convolution.
This function calls the following kernel: - * - * -# @ref CpuDepthwiseConv2dNativeKernel - * - */ - class CpuDepthwiseConv2dGeneric : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConv2dGeneric(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete; - /** Default move constructor */ - CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default; - /** Default destructor */ - ~CpuDepthwiseConv2dGeneric() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dGeneric::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; - }; - - DepthwiseConvolutionFunction _depth_conv_func; - CpuDepthwiseConv2dOptimizedInternal _func_optimized; - CpuDepthwiseConv2dGeneric _func_generic; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp deleted file mode 100644 index a36ee1d45b..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include <set> - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, 
activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; 
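// Note (assumed reading, mirroring the fp32 and quantized factories): the
// integer template arguments of these convolvers are
//   DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols,
//                               KernelRows, KernelCols,
//                               StrideRows, StrideCols, TIn, TBias, TOut>
// so e.g. <3, 3, 3, 3, 2, 2, float16_t, ...> produces a 3x3 output tile per
// inner iteration of a 3x3 kernel at stride 2. Each factory simply selects
// the specialization compiled for the requested kernel size and stride, and
// returns nullptr when none exists, which callers treat as "unsupported".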
- } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo *src, - const ITensorInfo *weights, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - const DataType data_type = src->data_type(); - const TensorShape shape = src->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = info.dilation.x(); - const int padding_top = info.pad_stride_info.pad_top(); - const int padding_left = info.pad_stride_info.pad_left(); - const int padding_bottom = info.pad_stride_info.pad_bottom(); - const int padding_right = info.pad_stride_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = info.pad_stride_info.stride().first; - const unsigned int kernel_size = weights->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - 
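// Note on the rescale parameters computed just below: the float multiplier
// M = (s_src * s_weights) / s_dst is assumed to follow the standard
// fixed-point requantization scheme, i.e.
// calculate_quantized_multiplier_less_than_one() decomposes M < 1 as
//   M = M0 * 2^(-shift), with M0 in [0.5, 1) stored as a Q0.31 integer.
// Worked example: M = 0.2 gives M0 = 0.8, shift = 2 and
// qmultiplier = round(0.8 * 2^31) = 1717986918, so the run-time rescale
// reduces to an integer high-multiply followed by a rounding right shift.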
ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector<float> fmultipliers; - std::vector<int32_t> qmultipliers; - std::vector<int32_t> qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl -{ - std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; - bool is_prepared{ false }; - experimental::MemoryRequirements mem_req{}; -}; - -#ifndef 
DOXYGEN_SKIP_THIS -CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() - : _pImpl(std::make_unique<LocalImpl>()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; - -void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, - const ITensorInfo *weights, - const ITensorInfo *bias, - ITensorInfo *dst, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_UNUSED(bias); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src, - weights, - bias != nullptr ? bias : nullptr, - dst, - info)); - - // Output auto initialization if not yet initialized - const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info())); - - _pImpl->is_prepared = false; - - // Create convolver - _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info); - ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); -} - -experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const -{ - return _pImpl->mem_req; -} - -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *dst, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); -
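// Note: compute_depthwise_convolution_shape is assumed to apply the
// standard convolution output formula per spatial axis,
//   out = (in + pad_lo + pad_hi - (k + (k - 1) * (d - 1))) / stride + 1,
// with the division rounded according to info.pad_stride_info. Example:
// in = 32, k = 3, d = 1, pads 1 + 1, stride 2 -> (32 + 2 - 3) / 2 + 1 = 16
// under floor rounding.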
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src, - const ITensorInfo *weights, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = src->data_layout(); - TensorShape in_shape{ src->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, src->tensor_shape().y()); - in_shape.set(Window::DimY, src->tensor_shape().z()); - in_shape.set(Window::DimZ, src->tensor_shape().x()); - } - - // Check data type - const DataType input_type = src->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weights size - std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = info.pad_stride_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = info.pad_stride_info.pad_top(); - const auto pad_right = info.pad_stride_info.pad_right(); - const auto pad_bottom = info.pad_stride_info.pad_bottom(); - const auto pad_left = info.pad_stride_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid &&
is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; -} - -void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) -{ - // Prepare assembly kernel - prepare(tensors); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(workspace == nullptr || workspace->buffer() == nullptr); - _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer())); - - ARM_COMPUTE_ERROR_ON(src->buffer() == nullptr); - const int input_element_size = src->info()->element_size(); - const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); - const int output_element_size = dst->info()->element_size(); - const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); -} - -void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) -{ - if(!_pImpl->is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = weights->info()->element_size(); - const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; - const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), - weights->buffer() + weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (bias != nullptr) ? bias->buffer() : nullptr); - _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); - - weights->mark_as_unused(); - if(bias != nullptr) - { - bias->mark_as_unused(); - } - _pImpl->is_prepared = true; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h deleted file mode 100644 index 195942b7fd..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H - -#include "src/core/common/Macros.h" -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Depthwise convolution assembly kernel glue */ -class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConv2dAssemblyDispatch(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); - /** Default destructor */ - ~CpuDepthwiseConv2dAssemblyDispatch(); - - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src. - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this returns true, the inputs and outputs might need to get permuted as the only layout supported is NHWC - * - * @param[in] src Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] info Depthwise convolution meta-data. - * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
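 *
 * For illustration (hypothetical infos; based on the checks in the
 * implementation): a square 3x3 or 5x5 kernel, equal strides of 1 or 2,
 * SAME or VALID padding and depth_multiplier == 1 pass this test, while
 * e.g. a 7x7 kernel or depth_multiplier > 1 falls back to the generic path.
 * @code
 * const bool can_use_asm =
 *     CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(&src_info, &weights_info, conv_info);
 * @endcode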
- */ - static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - struct LocalImpl; - std::unique_ptr<LocalImpl> _pImpl; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDequantize.h b/src/runtime/cpu/operators/CpuDequantize.h deleted file mode 100644 index d1fb9e8d0e..0000000000 --- a/src/runtime/cpu/operators/CpuDequantize.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H -#define ARM_COMPUTE_CPU_DEQUANTIZE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */ -class CpuDequantize : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuDequantize() = default; - /** Configure the kernel. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuDequantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp deleted file mode 100644 index 8812b777a3..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConv2d.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDirectConv2d.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuDirectConv2d::~CpuDirectConv2d() = default; - -CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() -{ -} - -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>(); - _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>(); - _input_border_handler = std::make_unique<NEFillBorderKernel>(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = src->data_layout() == DataLayout::NCHW ? 
Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(dst, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique<CpuActivation>(); - _activationlayer_function->configure(dst, dst, act_info); - } -} - -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = src->data_type(); - TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); - } - - return Status{}; -} - -void CpuDirectConv2d::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_is_padding_required) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); - } - NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, dst); - pack.add_tensor(TensorType::ACL_SRC_1, bias); - pack.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h deleted file mode 100644 index 9e584b9c49..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConv2d.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H -#define ARM_COMPUTE_CPU_DIRECTCONV2D_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" -#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -/** Function to run the direct convolution. - * - * This function calls the following kernels: - * - * -# @ref NEFillBorderKernel for the input - * -# @ref kernels::CpuDirectConv2dOutputStageKernel - * -# @ref kernels::CpuDirectConv2dKernel - */ -class CpuDirectConv2d : public ICpuOperator -{ -public: - /** Constructor */ - CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - /** Destructor */ - ~CpuDirectConv2d(); - /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - MemoryGroup _memory_group; - std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel; - std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel; - std::unique_ptr<NEFillBorderKernel> _input_border_handler; - std::unique_ptr<CpuActivation> _activationlayer_function; - Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp deleted file mode 100644 index 8953d4769c..0000000000 --- a/src/runtime/cpu/operators/CpuElementwise.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuElementwise.h" -#include "src/core/cpu/kernels/CpuElementwiseKernel.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuElementwiseBase::run(ITensorPack &tensors) -{ - // If the kernel has been configured, use the window from the kernel. 
- if(_kernel->is_window_configured()) - { - ICpuOperator::run(tensors); - return; - } - - auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info(); - auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info(); - auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape()); - ICpuOperator::run(tensors, shape_and_window.second); -} - -template <ArithmeticOperation op> -void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuArithmeticKernel>(); - k->configure(op, src0, src1, dst); - _kernel = std::move(k); -} - -template <ArithmeticOperation op> -Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst); -} - -template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>; -template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>; -template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>; -template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; - -void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuDivisionKernel>(); - k->configure(src0, src1, dst); - _kernel = std::move(k); -} - -Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - return kernels::CpuDivisionKernel::validate(src0, src1, dst); -} - -void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuPowerKernel>(); - k->configure(src0, src1, dst); - _kernel = std::move(k); -} - -Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - return kernels::CpuPowerKernel::validate(src0, src1, dst); -} - -template <ComparisonOperation COP> -void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuComparisonKernel>(); - k->configure(COP, src0, src1, dst); - _kernel = std::move(k); -} - -template <ComparisonOperation COP> -Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); -} - -void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op) -{ - auto k = std::make_unique<kernels::CpuComparisonKernel>(); - k->configure(op, src0, src1, dst); - _kernel = std::move(k); -} - -Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op) -{ - return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); -} - -// Supported Specializations -template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; -template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; -template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; -template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; -template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; -template class 
CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; -} // namespace cpu -} // namespace arm_compute
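The pattern above is the same for every binary elementwise operator: configure() records tensor metadata and builds the kernel, while run() pulls the actual tensors out of an ITensorPack via the ACL_SRC_0/ACL_SRC_1/ACL_DST slots used in CpuElementwiseBase::run(). A minimal usage sketch, assuming src0, src1 and dst are Tensor objects that have already been initialised and allocated (names are illustrative, not from the source):

    // Sketch only: tensor creation/allocation elided; assumes `using namespace arm_compute;`
    cpu::CpuElementwiseMax max_op;
    max_op.configure(src0.info(), src1.info(), dst.info());

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &src0);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &src1);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    max_op.run(pack); // dst[i] = max(src0[i], src1[i])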
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h
deleted file mode 100644
index 899a2ffdb7..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseBase : public ICpuOperator
-{
-public:
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power
- *
- * @note Max/Min/Squared difference supports input data type of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
- * @note PRelu supports input data type of QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
-template <ArithmeticOperation op>
-class CpuElementwiseArithmetic : public CpuElementwiseBase
-{
-public:
-    /** Configure the operator
-     *
-     * @param[in]  src0 The first source tensor information.
-     * @param[in]  src1 The second source tensor information. With PRelu, this is used as alpha tensor.
-     * @param[out] dst  The output tensor information.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if the given information will lead to a valid configuration
-     *
-     * @param[in] src0 The first source tensor information.
-     * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
-     * @param[in] dst  The output tensor information.
-     *
-     * @return A status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */
-using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */
-using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */
-using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be S32/F16/F32.
- * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
- */
-class CpuElementwiseDivision : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: Same as @p src0.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
-     *
-     * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
- * @note For an exponent that is a float, this function will only work with a positive base.
- */
-class CpuElementwisePower : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: F16/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: Same as @p src0.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
-     *
-     * @param[in] src0 First tensor input info. Data types supported: F16/F32.
-     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */ -class CpuElementwiseComparison : public CpuElementwiseBase -{ -public: - /** Initialise the kernel's inputs, dst and conversion policy. - * - * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: U16/U32. - * @param[in] op Comparison Operation to be performed. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op); - /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel - * - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: U16/U32. - * @param[in] op Comparison Operation to be performed. - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); -}; - -/** Basic function to run @ref cpu::kernels::CpuComparisonKernel - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a comparison operation between two tensors. - */ -template <ComparisonOperation op> -class CpuElementwiseComparisonStatic : public CpuElementwiseBase -{ -public: - /** Initialise the kernel's inputs, dst and conversion policy. - * - * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: U16/U32. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel - * - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: U16/U32. - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); -}; - -/** Basic function to run equal comparison. */ -using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; -/** Basic function to run not equal comparison. */ -using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; -/** Basic function to run greater comparison. */ -using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; -/** Basic function to run greater-equal comparison. */ -using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; -/** Basic function to run less comparison. */ -using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>; -/** Basic function to run less-equal comparison. */ -using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; -} // namespace cpu -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
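Because every operator in this header exposes a static validate() that works purely on ITensorInfo, a caller can reject an unsupported configuration before allocating any memory. A hedged sketch (shapes are illustrative; the U16 output type follows the comparison documentation above):

    // Sketch only: checks a Greater comparison without allocating tensors
    const TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
    const TensorInfo b(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo       out(TensorShape(16U, 4U), 1, DataType::U16);
    const Status     st = cpu::CpuElementwiseComparison::validate(&a, &b, &out, ComparisonOperation::Greater);
    if(st.error_code() != ErrorCode::OK)
    {
        // Configuration rejected; st.error_description() explains why
    }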
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp deleted file mode 100644 index c79e6e9acf..0000000000 --- a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuElementwiseUnary.h" -#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -using KernelType = kernels::CpuElementwiseUnaryKernel; - -void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) -{ - auto k = std::make_unique<KernelType>(); - k->configure(op, src, dst); - _kernel = std::move(k); -} - -Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) -{ - return KernelType::validate(op, src, dst); -} - -void CpuElementwiseUnary::run(ITensorPack &tensors) -{ - if(_kernel->is_window_configured()) - { - ICpuOperator::run(tensors); - return; - } - - auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info(); - ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); -} -} // namespace cpu -} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.h b/src/runtime/cpu/operators/CpuElementwiseUnary.h
deleted file mode 100644
index 721ba2a85b..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseUnary : public ICpuOperator
-{
-public:
-    /** Initialize the function
-     *
-     * @param[in]  op  Unary operation to execute
-     * @param[in]  src Input tensor information. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
-     * @param[out] dst Output tensor information. Data types supported: Same as @p src.
-     */
-    void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * @param[in] op  Unary operation to execute
-     * @param[in] src First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
-     * @param[in] dst Output tensor info. Data types supported: Same as @p src.
-     *
-     * @return a status
-     */
-    static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
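The unary operator follows the same configure/run split, except that configure() takes ITensorInfo references and the pack uses the single ACL_SRC slot (as in CpuElementwiseUnary::run() above). A minimal sketch, again assuming pre-allocated tensors with illustrative names:

    // Sketch only: negates src into dst
    cpu::CpuElementwiseUnary neg_op;
    neg_op.configure(ElementWiseUnary::NEG, *src.info(), *dst.info());

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    neg_op.run(pack); // dst[i] = -src[i]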
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuFill.cpp b/src/runtime/cpu/operators/CpuFill.cpp deleted file mode 100644 index 081e30ea17..0000000000 --- a/src/runtime/cpu/operators/CpuFill.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuFill.h" - -#include "src/core/cpu/kernels/CpuFillKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value) -{ - auto k = std::make_unique<kernels::CpuFillKernel>(); - k->configure(tensor, constant_value); - _kernel = std::move(k); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuFill.h b/src/runtime/cpu/operators/CpuFill.h deleted file mode 100644 index fac8e76481..0000000000 --- a/src/runtime/cpu/operators/CpuFill.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_FILL_H -#define ARM_COMPUTE_CPU_FILL_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuFillKernel */ -class CpuFill : public ICpuOperator -{ -public: - /** Constructor */ - CpuFill() = default; - /** Configure operator for a given list of arguments - * - * @param[in,out] tensor Tensor to fill. Supported data types: All - * @param[in] constant_value The value used to fill the planes of the tensor - */ - void configure(const ITensorInfo *tensor, PixelValue constant_value); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FILL_H */ diff --git a/src/runtime/cpu/operators/CpuFlatten.cpp b/src/runtime/cpu/operators/CpuFlatten.cpp deleted file mode 100644 index 58e6e4b671..0000000000 --- a/src/runtime/cpu/operators/CpuFlatten.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuFlatten.h" - -#include "src/core/cpu/kernels/CpuReshapeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuReshapeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuReshapeKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuFlatten.h b/src/runtime/cpu/operators/CpuFlatten.h deleted file mode 100644 index ae71453988..0000000000 --- a/src/runtime/cpu/operators/CpuFlatten.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_FLATTEN_H -#define ARM_COMPUTE_CPU_FLATTEN_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to flatten a given input */ -class CpuFlatten : public ICpuOperator -{ -public: - /** Constructor */ - CpuFlatten() = default; - /** Configure operator for a given list of arguments - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------| - * |All |All | - * - * @param[in] src Source tensor to flatten with at least 3 dimensions. - * The dimensions above the third will be interpreted as batches. Data types supported: All - * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where: - * w = width input tensor, h = height input tensor and d = depth input tensor. - * Data type supported: same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuFlatten::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FLATTEN_H */ diff --git a/src/runtime/cpu/operators/CpuFloor.cpp b/src/runtime/cpu/operators/CpuFloor.cpp deleted file mode 100644 index 4e169a04be..0000000000 --- a/src/runtime/cpu/operators/CpuFloor.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuFloor.h" - -#include "src/core/cpu/kernels/CpuFloorKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuFloorKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuFloorKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuFloor.h b/src/runtime/cpu/operators/CpuFloor.h deleted file mode 100644 index cbb9d565eb..0000000000 --- a/src/runtime/cpu/operators/CpuFloor.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_FLOOR_H -#define ARM_COMPUTE_CPU_FLOOR_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuFloorKernel */ -class CpuFloor : public ICpuOperator -{ -public: - /** Constructor */ - CpuFloor() = default; - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuFloor - * - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FLOOR_H */ diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp deleted file mode 100644 index e50099df1f..0000000000 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/FunctionDescriptors.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" - -#include <set> - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) -{ - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = src->quantization_info(); - const QuantizationInfo wqinfo = weights->quantization_info(); - const QuantizationInfo oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - const DataType data_type = src->data_type(); - // Merge activation with output stage - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); - } - GEMMLowpOutputStageInfo os_info; - os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - os_info.gemmlowp_offset = uoqinfo.offset; - os_info.gemmlowp_min_bound = min_activation; - os_info.gemmlowp_max_bound = max_activation; - os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); - return os_info; -} -cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) -{ - cpu::AsmGemmInfo asm_info; - asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv; - asm_info.ps_info = info.conv_info; - asm_info.activation_info = info.act_info; - asm_info.depth_output_gemm3d = true; - asm_info.reinterpret_input_as_3d = true; - asm_info.padding_top = info.conv_info.pad_top(); - asm_info.padding_left = info.conv_info.pad_left(); - asm_info.padding_value = 0.f; - asm_info.negated_offsets = false; - return asm_info; -} -} // namespace - -CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) - : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>(memory_manager)), - _activation_func(std::make_unique<CpuActivation>()), - _weights_permute_func(std::make_unique<CpuPermute>()), - _permuted_weights_info(), - _permuted_weights(std::make_unique<Tensor>()) -{ -} - -CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; - -void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, - weights, - biases != nullptr ? 
biases : nullptr,
-                                                           dst,
-                                                           info));
-    _original_weights_info = weights;
-    _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 });
-
-    // Configure assembly dispatch
-    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    if(is_data_type_quantized(src->data_type()))
-    {
-        asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
-    }
-    _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info);
-
-    // Configure activation
-    if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info))
-    {
-        _activation_func->configure(dst, nullptr, info.act_info);
-        _run_activation = true;
-    }
-}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
-    const DataType    data_type = src->data_type();
-    const TensorShape i_shape   = src->tensor_shape();
-    const TensorShape w_shape   = weights->tensor_shape();
-    ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-    // Validate biases
-    if(biases != nullptr)
-    {
-        if(is_data_type_quantized_asymmetric(data_type))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else if(data_type == DataType::BFLOAT16)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
-    return Status{};
-}
-void CpuGemmDirectConv2d::run(ITensorPack &tensors)
-{
-    prepare(tensors);
-
-    _gemm_asm_func->run(tensors);
-    if(_run_activation)
-    {
-        _activation_func->run(tensors);
-    }
-}
-
-void CpuGemmDirectConv2d::allocate_permuted_weights()
-{
-    // TODO: This function will be removed when memory injection is implemented.
-    ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr);
-    _permuted_weights->allocator()->free();
-    _permuted_weights->allocator()->init(_permuted_weights_info);
-    _permuted_weights->allocator()->allocate();
-}
-
-void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
-{
-    if(!_is_prepared)
-    {
-        allocate_permuted_weights();
-        ITensorPack permute_tensors
-        {
-            { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
-            { TensorType::ACL_DST, _permuted_weights.get() },
-        };
-
-        _weights_permute_func->run(permute_tensors);
-
-        tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused();
-
-        // Replace the original weights in the pack with the permuted tensor
-        tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get());
-        _is_prepared = true;
-    }
-}
-
-} // namespace cpu
-} // namespace arm_compute
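The prepare()/run() split above means the first run() permutes the weights into the internal tensor and marks the originals unused; later calls reuse the permuted copy. A sketch of the tensor pack run() expects; the use of ACL_SRC_2 for biases is an assumption carried over from the other convolution operators in this directory, and the tensor names are illustrative:

    // Sketch only: src/weights/biases/dst are assumed allocated Tensor objects
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &weights); // permuted on first run
    pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);  // assumed bias slot
    pack.add_tensor(TensorType::ACL_DST, &dst);
    conv2d.run(pack); // run() calls prepare(pack) internally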
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h deleted file mode 100644 index 6aa17c2349..0000000000 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H -#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <memory> - -namespace arm_compute -{ -// Forward declarations -class ITensor; -struct Conv2dInfo; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -class CpuActivation; -class CpuPermute; - -class CpuGemmDirectConv2d : public ICpuOperator -{ -public: - /** Constructor */ - CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); - /** Destructor */ - ~CpuGemmDirectConv2d(); - /** Set the input and output tensors. - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:--------------|:--------------|:--------------|:--------------| - * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | - * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | - * |F16 |F16 |F16 |F16 | - * |F32 |F32 |F32 |F32 | - * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | - * - * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] dst Destination tensor info. 
3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d - * - * Similar to CpuGemmDirectConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; - -private: - std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func; - std::unique_ptr<CpuActivation> _activation_func; - std::unique_ptr<CpuPermute> _weights_permute_func; - const ITensorInfo *_original_weights_info{}; - TensorInfo _permuted_weights_info; - std::unique_ptr<Tensor> _permuted_weights{ nullptr }; - bool _is_prepared{ false }; - bool _run_activation{ false }; - - /** Function to allocated a tensor for permuted weights - * - * @note This function will be removed when memory injection is properly implemented. - */ - void allocate_permuted_weights(); -}; -} // namespace cpu -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */ diff --git a/src/runtime/cpu/operators/CpuMul.cpp b/src/runtime/cpu/operators/CpuMul.cpp deleted file mode 100644 index 2f3d442a70..0000000000 --- a/src/runtime/cpu/operators/CpuMul.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuMul.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuMulKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); -} - -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique<kernels::CpuMulKernel>(); - k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); -} - -void CpuMul::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} - -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuComplexMulKernel::validate(src1, src2, dst); -} - -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique<kernels::CpuComplexMulKernel>(); - k->configure(src1, src2, dst); - _kernel = std::move(k); -} - -void CpuComplexMul::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute
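The deleted CpuMul.cpp above illustrates the stateless-operator pattern shared by all the files in this diff: configure() and validate() work purely on ITensorInfo metadata, and the real tensors only arrive at run() through an ITensorPack. A minimal caller-side sketch, not part of the diff, assuming pre-allocated tensors and the usual ACL_SRC_0/ACL_SRC_1/ACL_DST pack slots:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuMul.h"

using namespace arm_compute;

void mul_example(Tensor &a, Tensor &b, Tensor &out)
{
    cpu::CpuMul mul;
    // Configuration only touches metadata (shape, data type, quantization info).
    mul.configure(a.info(), b.info(), out.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    // Buffers are bound late, so one configured operator can serve many tensor sets.
    ITensorPack pack = { { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &out } };
    mul.run(pack);
}

Because the operator holds no tensor state, the same configured instance can be reused as long as the tensors handed to run() match the infos it was configured with.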
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuMul.h b/src/runtime/cpu/operators/CpuMul.h deleted file mode 100644 index 6e717188a4..0000000000 --- a/src/runtime/cpu/operators/CpuMul.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_MUL_H -#define ARM_COMPUTE_CPU_MUL_H - -#include "arm_compute/core/ITensorInfo.h" -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuMulKernel */ -class CpuMul : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuMul() = default; - /** Initialise the kernel's inputs, dst and conversion policy. - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst dst tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p src1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; - -/** Basic function to run @ref kernels::CpuComplexMulKernel */ -class CpuComplexMul : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuComplexMul() = default; - /** Initialise the kernel's inputs, dst. - * - * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuComplexMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_MUL_H */
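The scale constraint spelled out in the configure() documentation above (positive, and exactly 1/255 or 1/2^n with n between 0 and 15) is easy to pre-check on the caller side. A hedged sketch; the helper name is illustrative and not part of the library:

// Mirrors the documented CpuMul scale rule. Exact float comparison is
// deliberate: the kernel special-cases these exact representations.
static bool is_supported_mul_scale(float scale)
{
    if(scale == 1.f / 255.f)
    {
        return true; // 1/255 selects the round-to-nearest-even path
    }
    for(int n = 0; n <= 15; ++n)
    {
        if(scale == 1.f / static_cast<float>(1 << n))
        {
            return true; // 1/2^n selects the round-to-zero path
        }
    }
    return false;
}

Note the S32 corner case from the same doc block: when src1, src2 and dst are all S32, 1/255 is additionally excluded, so a caller would need to check the data types as well.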
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPRelu.h b/src/runtime/cpu/operators/CpuPRelu.h deleted file mode 100644 index a6859f95d9..0000000000 --- a/src/runtime/cpu/operators/CpuPRelu.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PRELU_H -#define ARM_COMPUTE_CPU_PRELU_H - -#include "src/runtime/cpu/operators/CpuElementwise.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the PRelu operation */ -using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; -} // namespace cpu -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CPU_PRELU_H */
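Since CpuPRelu is only a using-alias over the elementwise-arithmetic template, it exposes the same configure/validate/run surface as the other arithmetic operators in this directory. A usage sketch under the same assumptions as the earlier example (ACL_SRC_0/ACL_SRC_1/ACL_DST slots; not part of the diff):

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuPRelu.h"

using namespace arm_compute;

void prelu_example(Tensor &x, Tensor &alpha, Tensor &out)
{
    cpu::CpuPRelu prelu;
    // PRELU(x, alpha) = x for x > 0, alpha * x otherwise; alpha broadcasts like any second operand.
    prelu.configure(x.info(), alpha.info(), out.info());

    ITensorPack pack = { { TensorType::ACL_SRC_0, &x }, { TensorType::ACL_SRC_1, &alpha }, { TensorType::ACL_DST, &out } };
    prelu.run(pack);
}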
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPermute.cpp b/src/runtime/cpu/operators/CpuPermute.cpp deleted file mode 100644 index 7fde1e3767..0000000000 --- a/src/runtime/cpu/operators/CpuPermute.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include "src/core/cpu/kernels/CpuPermuteKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - auto k = std::make_unique<kernels::CpuPermuteKernel>(); - k->configure(src, dst, perm); - _kernel = std::move(k); -} - -Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - return kernels::CpuPermuteKernel::validate(src, dst, perm); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPermute.h b/src/runtime/cpu/operators/CpuPermute.h deleted file mode 100644 index 2b30d7fbd8..0000000000 --- a/src/runtime/cpu/operators/CpuPermute.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_CPU_PERMUTE_H -#define ARM_COMPUTE_CPU_PERMUTE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuPermuteKernel */ -class CpuPermute : public ICpuOperator -{ -public: - /** Constructor */ - CpuPermute() = default; - /** Configure operator for a given list of arguments - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPermute - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[in] dst Destination tensor. Data types supported: Same as @p src - * @param[in] perm Permutation vector - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PERMUTE_H */ diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp deleted file mode 100644 index e746c8fb3b..0000000000 --- a/src/runtime/cpu/operators/CpuPool2d.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "src/runtime/cpu/operators/CpuPool2d.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/kernels/CpuPool2dKernel.h" -#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -CpuPool2d::CpuPool2d() - : _pooling_layer_kernel(), - _border_handler(), - _asm_glue(), - _is_global_pooling_layer(false), - _data_layout(DataLayout::NCHW), - _aux_mem(1) -{ -} - -CpuPool2d::~CpuPool2d() = default; - -void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - - // Check if we have Global Pooling Layer - const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - - if(run_optimised) - { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); - - auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>(); - ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); - pooling_wrapper->configure(src, dst, pool_info, ci); - - // Get kernel's memory requirements - constexpr size_t alignment = 4096; - const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); - - _asm_glue = std::move(pooling_wrapper); - } - else - { - // Configure pooling kernel - auto k = std::make_unique<kernels::CpuPool2dKernel>(); - k->configure(src, dst, pool_info, indices); - _pooling_layer_kernel = std::move(k); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value((indices) ? 
std::numeric_limits<int>::min() : 0.f); - if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique<NEFillBorderKernel>(); - b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); - _border_handler = std::move(b); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - if(run_optimised) - { - return Status{}; - } - - return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); -} - -void CpuPool2d::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - - if(_asm_glue) - { - const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; - NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); - } - else - { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); - - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -experimental::MemoryRequirements CpuPool2d::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h deleted file mode 100644 index 68416b5cfc..0000000000 --- a/src/runtime/cpu/operators/CpuPool2d.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_POOL2D_H -#define ARM_COMPUTE_CPU_POOL2D_H - -#include "arm_compute/core/experimental/Types.h" -#include "src/core/common/Macros.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <memory> - -namespace arm_compute -{ -// Forward Declarations -struct PoolingLayerInfo; - -namespace cpu -{ -/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: - * - * -# @ref NEFillBorderKernel (executed if padding size is different from zero) - * -# @ref kernels::CpuPool2dKernel - * -# @ref kernels::CpuPool2dAssemblyWrapperKernel - */ -class CpuPool2d : public ICpuOperator -{ -public: - /** Constructor */ - CpuPool2d(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); - /** Default destructor */ - ~CpuPool2d(); - /** Set the src and dst tensors. - * - * @note F16 is supported for pool sizes 2 and 3 only - * - * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - std::unique_ptr<INEKernel> _pooling_layer_kernel; - std::unique_ptr<INEKernel> _border_handler; - std::unique_ptr<INEKernel> _asm_glue; - - bool _is_global_pooling_layer; - DataLayout _data_layout; - experimental::MemoryRequirements _aux_mem{}; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/runtime/cpu/operators/CpuQuantize.h b/src/runtime/cpu/operators/CpuQuantize.h deleted file mode 100644 index 09afffd920..0000000000 --- a/src/runtime/cpu/operators/CpuQuantize.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_QUANTIZE_H -#define ARM_COMPUTE_CPU_QUANTIZE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */ -class CpuQuantize : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuQuantize() = default; - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuQuantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */ diff --git a/src/runtime/cpu/operators/CpuReshape.cpp b/src/runtime/cpu/operators/CpuReshape.cpp deleted file mode 100644 index 33c9cb87b6..0000000000 --- a/src/runtime/cpu/operators/CpuReshape.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "src/runtime/cpu/operators/CpuReshape.h" - -#include "src/core/cpu/kernels/CpuReshapeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuReshapeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuReshapeKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuReshape.h b/src/runtime/cpu/operators/CpuReshape.h deleted file mode 100644 index e136043568..0000000000 --- a/src/runtime/cpu/operators/CpuReshape.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_RESHAPE_H -#define ARM_COMPUTE_CPU_RESHAPE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuReshapeKernel */ -class CpuReshape : public ICpuOperator -{ -public: - /** Constructor */ - CpuReshape() = default; - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor info. Data type supported: All - * @param[out] dst Destination info. Data type supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuReshape - * - * @param[in] src Source tensor info. Data type supported: All - * @param[in] dst Destination tensor info. Data type supported: Same as @p src - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_RESHAPE_H */ diff --git a/src/runtime/cpu/operators/CpuScale.cpp b/src/runtime/cpu/operators/CpuScale.cpp deleted file mode 100644 index 681a15e26c..0000000000 --- a/src/runtime/cpu/operators/CpuScale.cpp +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuScale.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuScaleKernel.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) -{ - ARM_COMPUTE_ERROR_ON(offsets == nullptr); - float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) - { - sampling_offset = 0.5f; - } - - Window win; - win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); - win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - - if(dx != nullptr && dy != nullptr) - { - // Pre-compute the offset and pixel's distance for BILINEAR interpolation - Iterator offsets_it(offsets, win); - Iterator dx_it(dx, win); - Iterator dy_it(dy, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); - } - else - { - // Pre-compute the offset for NEAREST interpolation - Iterator offsets_it(offsets, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast<size_t>(align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - }, - offsets_it); - } -} -} // namespace - -CpuScale::CpuScale() - : _scale_info(InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED), _data_layout(DataLayout::UNKNOWN), _is_prepared(false) -{ -} - -void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info)); - - _scale_info = info; - - // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; - - // Get the tensor shape - TensorShape shape(dst->dimension(idx_width)); - shape.set(1, dst->dimension(idx_height), false); - - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dxdy(shape, Format::F32); - - auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy); - auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); - auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); - auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info); - break; - } - case InterpolationPolicy::BILINEAR: - { - scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info); - break; - } - case InterpolationPolicy::AREA: - { - scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); - } - _kernel = std::move(scale_kernel); -} - -Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - - ITensorInfo *offsets = nullptr; - ITensorInfo *dx = nullptr; - ITensorInfo *dy = nullptr; - - // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; - - // Get the tensor shape of auxiliary buffers - const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dx(shape, Format::F32); - TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - offsets = &tensor_info_offsets; - break; - case InterpolationPolicy::BILINEAR: - offsets = &tensor_info_offsets; - dx = &tensor_info_dx; - dy = &tensor_info_dy; - break; - default: - break; - } - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); - return Status{}; -} - -void CpuScale::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - _is_prepared = true; - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - auto dx = tensors.get_tensor(TensorType::ACL_INT_0); - auto dy = tensors.get_tensor(TensorType::ACL_INT_1); - auto offsets = tensors.get_tensor(TensorType::ACL_INT_2); - - // Get data layout and width/height indices - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; - const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - // Pre-compute offsets for nearest interpolation - precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used); - break; - } - case InterpolationPolicy::BILINEAR: - { - // Pre-compute dx, dy and offsets for bilinear interpolation - precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used); - break; - } - case InterpolationPolicy::AREA: - { - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); - } - } -} - -void CpuScale::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuScale.h b/src/runtime/cpu/operators/CpuScale.h deleted file mode 100644 index 90248a8d59..0000000000 --- a/src/runtime/cpu/operators/CpuScale.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SCALE_H -#define ARM_COMPUTE_CPU_SCALE_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to compute Scale */ -class CpuScale : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuScale(); - /** Initialize the function's source, destination, interpolation type and border_mode. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) - * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
- * @param[in] info @ref ScaleKernelInfo to be used for configuration - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuScale - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) - * @param[in] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo to be used for validation - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); - - // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; - -private: - ScaleKernelInfo _scale_info; - DataLayout _data_layout; - bool _is_prepared; -}; -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_SCALE_H */ diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp deleted file mode 100644 index e17925ee50..0000000000 --- a/src/runtime/cpu/operators/CpuSoftmax.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "src/runtime/cpu/operators/CpuSoftmax.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuSoftmaxKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/SoftmaxHelpers.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -template <bool IS_LOG> -CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() - : _permute_input(), - _permute_output(), - _max_kernel(), - _softmax_kernel(), - _max(), - _tmp(), - _input_permuted(), - _output_permuted(), - _needs_permute(false), - _aux_mem(InternalTensorIdx::COUNT) -{ -} - -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); - - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); - - _needs_permute = actual_axis > 0; - - if(_needs_permute) - { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); - } - - // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case) - // or it is the original input case (2D case) - const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? 
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); - - // Init intermediate tensors - _max = TensorInfo(max_info); - _tmp = TensorInfo(tensor_info_tmp); - - // Configure kernels - auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); - if(_needs_permute) - { - // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); - - // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); - } - else - { - // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); - } - _softmax_kernel = std::move(sm); - - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); -} - -template <bool IS_LOG> -Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) -{ - // Perform validation step - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis); - - // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); - const TensorInfo dont_care; - - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); - - const bool needs_permute = actual_axis > 0; - - if(needs_permute) - { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); - TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); - - return Status{}; -} - -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, false); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, false); - - CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, false); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, false); - - ITensorPack max_pack; - ITensorPack softmax_pack; - - if(_needs_permute) - { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; - _permute_input.run(permute_in_pack); - - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; - } - else - { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; - } - - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); - NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - - if(_needs_permute) - { - ITensorPack permute_out_pack; - permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); - permute_out_pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output.run(permute_out_pack); - } -} - -template <bool IS_LOG> -experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const -{ - return _aux_mem; -} - -template class CpuSoftmaxGeneric<false>; -template class CpuSoftmaxGeneric<true>; -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h deleted file mode 100644 index 38817977b3..0000000000 --- a/src/runtime/cpu/operators/CpuSoftmax.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -class CpuLogits1DMaxKernel; -template <bool IS_LOG> -class CpuLogits1DSoftmaxKernel; - -/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. - * - * Softmax is calculated by : - * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f] - * - * Log Softmax is calculated by : - * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f] - * - * This function runs the following function/kernels: - * -# If axis is not 0: - * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel - */ -template <bool IS_LOG = false> -class CpuSoftmaxGeneric : public ICpuOperator -{ -public: - /** Constructor */ - CpuSoftmaxGeneric(); - /** Set the input and output tensors. - * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and - * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuSoftmax - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p input - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and - * axis=1, softmax will be applied to 4x6=24 vectors of size 5.
Defaults to 0 - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - enum InternalTensorIdx - { - MAX = 0, - TMP, - PERMUTED_SRC, - PERMUTED_DST, - COUNT - }; - - CpuPermute _permute_input; - CpuPermute _permute_output; - std::unique_ptr<ICpuKernel> _max_kernel; - std::unique_ptr<ICpuKernel> _softmax_kernel; - - TensorInfo _max; - TensorInfo _tmp; - TensorInfo _input_permuted; - TensorInfo _output_permuted; - - bool _needs_permute; - experimental::MemoryRequirements _aux_mem{}; -}; -using CpuSoftmax = CpuSoftmaxGeneric<false>; -using CpuLogSoftmax = CpuSoftmaxGeneric<true>; - -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ diff --git a/src/runtime/cpu/operators/CpuSub.cpp b/src/runtime/cpu/operators/CpuSub.cpp deleted file mode 100644 index 9baaaa9d67..0000000000 --- a/src/runtime/cpu/operators/CpuSub.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuSub.h" - -#include "src/core/cpu/kernels/CpuSubKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique<kernels::CpuSubKernel>(); - k->configure(src0, src1, dst, policy); - _kernel = std::move(k); -} - -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuSubKernel::validate(src0, src1, dst, policy); -} -} // namespace cpu -} // namespace arm_compute
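A minimal caller-side sketch of how a thin operator such as CpuSub above is driven, assuming the experimental ITensorInfo/ITensorPack API used throughout this tree (the shapes, tensor names and the run_sub_example helper are illustrative, not part of this change):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuSub.h"

    using namespace arm_compute;

    void run_sub_example()
    {
        // Operators in this API are configured on metadata only.
        TensorInfo src0_info(TensorShape(16U, 4U), 1, DataType::F32);
        TensorInfo src1_info(TensorShape(16U, 4U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32);

        // Validate first: configure() asserts on unsupported configurations.
        if(!cpu::CpuSub::validate(&src0_info, &src1_info, &dst_info, ConvertPolicy::SATURATE))
        {
            return;
        }

        cpu::CpuSub sub;
        sub.configure(&src0_info, &src1_info, &dst_info, ConvertPolicy::SATURATE);

        // Backing memory is owned by the caller and handed over at run time via a pack.
        Tensor src0, src1, dst;
        src0.allocator()->init(src0_info);
        src1.allocator()->init(src1_info);
        dst.allocator()->init(dst_info);
        src0.allocator()->allocate();
        src1.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC_0, &src0 }, { TensorType::ACL_SRC_1, &src1 }, { TensorType::ACL_DST, &dst } };
        sub.run(pack);
    }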
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuSub.h b/src/runtime/cpu/operators/CpuSub.h deleted file mode 100644 index 099ffef87e..0000000000 --- a/src/runtime/cpu/operators/CpuSub.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SUB_H -#define ARM_COMPUTE_CPU_SUB_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuSubKernel */ -class CpuSub : public ICpuOperator -{ -public: - /** Initialise the kernel's inputs, dst and conversion policy. - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuSub - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] src1 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] dst Output tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SUB_H */
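The overflow-policy restriction called out above ("Convert policy cannot be WRAP if datatype is quantized") can be checked up front through the static validate(); a hedged sketch, with illustrative quantization parameters:

    // WRAP wrap-around arithmetic is not defined for quantized data, so validate()
    // is expected to reject it while accepting SATURATE for the same tensors.
    TensorInfo qsrc0(TensorShape(32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
    TensorInfo qsrc1(TensorShape(32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
    TensorInfo qdst(TensorShape(32U), 1, DataType::QASYMM8, QuantizationInfo(1.f, 0));

    Status wrap_status = cpu::CpuSub::validate(&qsrc0, &qsrc1, &qdst, ConvertPolicy::WRAP);     // expected to fail
    Status sat_status  = cpu::CpuSub::validate(&qsrc0, &qsrc1, &qdst, ConvertPolicy::SATURATE); // expected to pass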
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuTranspose.cpp b/src/runtime/cpu/operators/CpuTranspose.cpp deleted file mode 100644 index 51eeb90b8b..0000000000 --- a/src/runtime/cpu/operators/CpuTranspose.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuTranspose.h" - -#include "src/core/cpu/kernels/CpuTransposeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuTransposeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuTransposeKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuTranspose.h b/src/runtime/cpu/operators/CpuTranspose.h deleted file mode 100644 index c0232ddab2..0000000000 --- a/src/runtime/cpu/operators/CpuTranspose.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H -#define ARM_COMPUTE_CPU_TRANSPOSE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuTransposeKernel */ -class CpuTranspose : public ICpuOperator -{ -public: - /** Constructor */ - CpuTranspose() = default; - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuTranspose - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[in] dst Destination tensor. Data types supported: Same as @p src - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */ diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp deleted file mode 100644 index ea3742fee5..0000000000 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ /dev/null @@ -1,869 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/CPP/Validate.h" -#include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" -#include "src/core/cpu/kernels/assembly/arm_gemm.hpp" - -#include <arm_neon.h> -#include <cstdlib> - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -struct free_delete -{ - void operator()(void *x) - { - free(x); - } -}; - -struct Params -{ - unsigned int M; - unsigned int N; - unsigned int K; - unsigned int batches; - unsigned int multis; - unsigned int sections; - bool indirect; -}; - -Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - Params p; - p.M = d->tensor_shape().y(); - p.K = a->tensor_shape().x(); - p.N = d->tensor_shape().x(); - p.batches = 1; - p.multis = 1; - p.sections = 1; - p.indirect = false; - - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) - { - p.indirect = true; - p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; - } - else - { - p.multis = b->tensor_shape().z(); - p.batches = d->tensor_shape().total_size_upper(2) / p.multis; - } - - // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) - { - p.M = d->tensor_shape().y() * d->tensor_shape().z(); - p.batches = d->tensor_shape().total_size_upper(3) / p.multis; - } - - return p; -} - -arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) -{ - arm_gemm::Activation gemm_act; - - // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) - { - return gemm_act; - } - - switch(act.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - gemm_act.type = arm_gemm::Activation::Type::ReLU; - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = 0.f; - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = act.b(); - break; - default: - gemm_act.type = arm_gemm::Activation::Type::None; - } - - return gemm_act; -} - -IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) -{ - // Schedule assembly kernel - const int granule_threshold = 200; - IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) - { - scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); - } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) - { - //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) - { - //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = 
IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - - return scheduling_hint; -} - -template <typename TypeInput, typename TypeOutput> -class FallbackTransform : public ITransformWeights -{ -public: - FallbackTransform() noexcept {}; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform(const FallbackTransform &) = delete; - /** Default move constructor */ - FallbackTransform(FallbackTransform &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform &operator=(const FallbackTransform &) = delete; - /** Default move assignment operator */ - FallbackTransform &operator=(FallbackTransform &&) = default; - void run() override - { - _output.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - uint32_t id = (_B_pretranspose_size | 0x80000000); - return id; - } - - void configure(size_t B_pretranspose_size, unsigned int alignment) - { - _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - _B_pretranspose_size = B_pretranspose_size; - } - - void set_pretranspose(ITensor *tensor) - { - if(!_reshape_run) - { - _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer()); - } - } - - void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm) - { - _ldb = ldb; - _in1_ptr = in1_ptr; - _multi_stride_b = multi_stride_b; - _gemm_kernel_asm = gemm_kernel_asm; - } - -private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; -}; - -/** Fallback in case ACL doesn't have a function */ -template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing> -class Fallback : public CpuGemmAssemblyDispatch::IFallback -{ -public: - /** Destructor */ - ~Fallback() - { - if(_pretranspose && !(is_weight_managed())) - { - delete _pretranspose; - } - } - - /** Initialise the function's input and output. - * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. - * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. - * @param[in] gemm_info GEMM meta-data - * @param[in] memory_group Memory group to be used by the function. - * @param[in] weights_manager Weights manager to be used by the function. - * @param[in] os Output stage meta-data.
- */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); - - /** Set requantization data to be used - * - * @param[in] shifts Requantization shifts - * @param[in] multipliers Requantization multipliers - * - * @return A tuple with the pointers to the shift and multiplier data respectively - */ - std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - bool is_configured() const override; - -private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. - */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); - /** Configure the indirect buffer - * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] info GEMM meta-data - */ - void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); - /** Prepare the indirect buffer */ - void prepare_indirect_buffer(ITensorPack &tensors); - - /** Assembly Gemm kernel */ - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; - /** Optimised Arm® Neon™ kernel */ - std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - ITensor *_pretranspose{ nullptr }; - /** Prepared flag */ - bool _is_prepared{ false }; - /** GEMM meta-data */ - AsmGemmInfo _gemm_info{}; - /** Weights manager */ - IWeightsManager *_weights_manager{ nullptr }; - /** Weights transform object */ - FallbackTransform<TypeInput, TypeOutput> _weights_transform{}; - /** GEMM kernel description */ - arm_gemm::KernelDescription _kernel_info{}; - /** Per channel quantization shifts */ - std::vector<int32_t> _shifts{}; - std::vector<int32_t> right_shifts{}; - std::vector<int32_t> left_shifts{}; - /** Per channel quantization multipliers */ - std::vector<int32_t> _multipliers{}; - /** Indirect buffer */ - std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; - std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; - std::vector<TypeInput> _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - - bool is_weight_managed() - { - // TODO (COMPMID-4539): This function should do the following: - // _weights_manager && _weights_manager->are_weights_managed(_b) - // , where _b is the second Tensor given to configure(). - // Currently, however, the weights manager is disabled to make this class stateless. - // This should be revisited in the future.
- return false; - } - - void acquire_managed_weight() - { - // TODO (COMPMID-4539): This function should do the following: - // _pretranspose = _weights_manager->acquire(_b, &_weights_transform); - // , where _b is the second Tensor given to configure(). - // Currently, however, the weights manager is disabled to make this class stateless. - _pretranspose = nullptr; - } -}; - -template <typename TypeInput, typename TypeOutput, class OutputStage> -std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> -Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers) -{ - _multipliers = multipliers; - _shifts = shifts; - bool need_left = false; - for(const auto s : _shifts) - { - left_shifts.push_back(std::max(-s, int32_t(0))); - right_shifts.push_back(std::min(-s, int32_t(0))); - if(s < 0 && !need_left) - { - need_left = true; - } - } - return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data()); -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors) -{ - auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer()); - const int multis = 1; - const int batches = a->info()->tensor_shape().total_size_upper(3); - const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput); - const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput); - const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput); - - const size_t output_hw = _cp.output_height * _cp.output_width; - const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); - const size_t batch_stride = batch_size / sizeof(TypeInput); - const int multi_size = batch_size * batches; - const size_t multi_stride = multi_size / sizeof(TypeInput); - - for(int64_t m = 0; m < multis; m++) - { - for(int64_t b = 0; b < batches; b++) - { - for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) - { - for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) - { - int64_t output_xy = (output_y * _cp.output_width) + output_x; - - for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) - { - for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) - { - int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; - int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; - int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; - int64_t input_xy = (input_y * _cp.input_width) + input_x; - - if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) - { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); - } - else - { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = - A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); - } - } - } - } - } - } - } -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) -{ - ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method ==
AsmConvMethod::Indirect)); - - float zeropad = 0.f; - if(is_data_type_quantized(a->data_type())) - { - zeropad = a->quantization_info().uniform().offset; - } - - const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]); - const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]); - const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]); - const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]); - const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]); - const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]); - const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]); - - _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, - info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad - }; - - if(info.method == AsmConvMethod::Conv) - { - _gemm_kernel_asm->set_convolution_parameters(_cp); - } - - if(info.method == AsmConvMethod::Indirect) - { - const unsigned int multis = 1; - const unsigned int batches = a->tensor_shape().total_size_upper(3); - const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height; - const unsigned int output_hw = _cp.output_width * _cp.output_height; - - using TypeInputPtr = TypeInput *; - const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr); - const size_t batch_stride = batch_size / sizeof(TypeInputPtr); - const int multi_size = batch_size * batches; - const size_t multi_stride = multi_size / sizeof(TypeInputPtr); - - _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); - _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); - _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad)); - - // Set indirect argument - int64_t pos = 0; - for(int64_t m = 0; m < multis; m++) - { - for(int64_t b = 0; b < batches; b++) - { - for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) - { - (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; - } - } - } - - _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get()); - } -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) -{ - ARM_COMPUTE_UNUSED(c); - arm_gemm::GemmConfig gemm_cfg; - _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os); - _weights_manager = weights_manager; - if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED) - { - gemm_cfg.filter = _kernel_info.name; - args._cfg = &gemm_cfg; - } - _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); - if(_gemm_kernel_asm == nullptr) - { - //configuration not supported: Leave function unconfigured: - return; - } - - // arm_compute wrapper for the Gemm object (see above) - auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>(); - ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); - 
acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } - - // If we disable this code block below then ConvLayer deadlocks when threads > 1 and - // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 - { - const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - if(window_size < static_cast<unsigned int>(args._maxthreads)) - { - _gemm_kernel_asm->set_nthreads(window_size); - } - } - - _optimised_kernel = std::move(acl_gemm_wrapper); - _gemm_info = gemm_info; - // Check for pre-transposed support - if(_gemm_kernel_asm->B_pretranspose_required()) - { - // Forcing 128-byte alignment (required by 32-bit kernels) - const unsigned int alignment = 128; - const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); - if(is_weight_managed()) - { - _weights_transform.configure(B_pretranspose_size, alignment); - acquire_managed_weight(); - } - else - { - _pretranspose = new Tensor(); - static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - } - } - - // Handle indirect GEMM convolution - if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) - { - configure_indirect(a, b, d, gemm_info); - } -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) -{ - auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); - if(!_is_prepared) - { - // Set up matrix bias in the assembly kernel, it's just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32) - { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); - } - - // Pretranspose B if required - if(_gemm_kernel_asm->B_pretranspose_required()) - { - const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput); - const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); - - if(is_weight_managed()) - { - _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm); - _weights_manager->run(b, &_weights_transform); - - // If we didn't run the reshape function, set the pretransposed buffer - if(!_weights_transform.is_reshape_run()) - { - _weights_transform.set_pretranspose(_pretranspose); - } - } - else - { - static_cast<Tensor *>(_pretranspose)->allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b); - b->mark_as_unused(); - } - } - - if(_gemm_info.method == AsmConvMethod::Indirect) - { - prepare_indirect_buffer(tensors); - } - - _is_prepared = true; - } -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment) -{ - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment); - memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const -{ - return _optimised_kernel != nullptr; -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) -{ - auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto d = tensors.get_tensor(TensorType::ACL_DST); - - int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput); - int ldb = 0; - const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput); - - const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2; - const size_t a_multi_idx = a_batch_idx + 1; - const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 
3 : 2; - const size_t d_multi_idx = d_batch_idx + 1; - - int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput); - const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput); - - int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput); - int multi_stride_b = 0; - const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput); - - auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); - const TypeInput *in1_ptr = nullptr; - auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes()); - - // Check if B is pre-transposed and de-reference if not - if(!_gemm_kernel_asm->B_is_pretransposed()) - { - ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput); - multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); - in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - } - - const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type()); - - // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads - if(_workspace.buffer() != nullptr) - { - _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer())); - const unsigned int split_dim = scheduling_hint.split_dimension(); - const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - unsigned int num_threads = NEScheduler::get().num_threads(); - if(window_size < num_threads) - { - num_threads = window_size; - } - if(split_dim != IScheduler::split_dimensions_all) - { - // Make sure the kernel does not expect more threads than we can actually spawn - const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); - num_threads = std::min(num_iterations, num_threads); - } - _gemm_kernel_asm->set_nthreads(num_threads); - } - - // Prepare assembly kernel - prepare(tensors); - - // Set up matrix bias in the assembly kernel, it's just a pointer to matrix C.
- TypeOutput *bias = nullptr; - if(c && c->info()->data_type() != DataType::S32) - { - bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); - } - - if(_gemm_info.method == AsmConvMethod::Indirect) - { - in0_ptr = nullptr; - lda = 0; - batch_stride_a = 0; - multi_stride_a = 0; - } - - // Set gemm parameters - _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, - in1_ptr, ldb, multi_stride_b, - out_ptr, ldd, batch_stride_d, multi_stride_d, - bias, 0); - // Schedule - NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); -} - -template <typename TypeInput, typename TypeOutput> -void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) -{ - Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); - - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); - - // Create arm_gemm fallback - auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager); - arm_gemm = std::move(fallback); -} - -template <typename TypeInput, typename TypeOutput> -void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) -{ - ARM_COMPUTE_UNUSED(activation); - Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); - - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); - - // Create arm_gemm fallback - auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); - - // Configure requantization info - const int32_t negation = info.negated_offsets ? 1 : -1; - const int32_t a_offset = -a->quantization_info().uniform().offset * negation; - const int32_t b_offset = -b->quantization_info().uniform().offset * negation; - const GEMMLowpOutputStageInfo os_info = info.output_stage; - - arm_gemm::Requantize32 gemm_requant_info{}; - if(os_info.gemmlowp_shifts.size() > 1) - { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - (std::get<0>(requantize_data)) ? 
std::get<1>(requantize_data) : nullptr, - std::get<2>(requantize_data), - std::get<3>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - else - { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - - // Configure fallback - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); - arm_gemm = std::move(fallback); -} - -} //namespace - -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) -{ -} - -Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) -{ - ARM_COMPUTE_UNUSED(c, info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input"); - return Status{}; -} - -bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) -{ - arm_gemm::Activation act = map_to_arm_gemm_activation(activation); - return act.type != arm_gemm::Activation::Type::None; -} - -void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info); - - //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - 
if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) - { - return; - } - - switch(a->data_type()) - { - case DataType::F32: - create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - break; -#ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(d->data_type() == DataType::S32) - { - create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - } - else - { - create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - } - break; - case DataType::S8: - case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) - { - create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - } - else - { - create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - } - break; -#endif /* __aarch64__ */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - break; - } -} - -void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->prepare(tensors); -} - -bool CpuGemmAssemblyDispatch::is_configured() const -{ - return _arm_gemm != nullptr && _arm_gemm->is_configured(); -} - -void CpuGemmAssemblyDispatch::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->run(tensors); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h deleted file mode 100644 index ffc097c75c..0000000000 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H -#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H - -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/common/Macros.h" -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/* Convolution method supported by the assembly gemm interface */ -enum class AsmConvMethod -{ - Im2Col, - Indirect, - Conv -}; - -struct AsmGemmInfo -{ - AsmConvMethod method{ AsmConvMethod::Im2Col }; - PadStrideInfo ps_info{}; - ActivationLayerInfo activation_info{}; - GEMMLowpOutputStageInfo output_stage{}; - bool negated_offsets{ true }; - bool reinterpret_input_as_3d{ false }; - bool depth_output_gemm3d{ false }; - int64_t padding_top{ 0 }; - int64_t padding_left{ 0 }; - float padding_value{ 0.f }; -}; - -/** Assembly kernel glue */ -class CpuGemmAssemblyDispatch : public ICpuOperator -{ -public: - /** Constructor */ - CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); - /** Default destructor */ - ~CpuGemmAssemblyDispatch() = default; - - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch); - - class IFallback - { - public: - virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual bool is_configured() const = 0; - virtual ~IFallback() = default; - }; - -public: - /** If supported create a Compute Library function else fallback to the arm_gemm function. - * - * @param[in] a Input tensor (Matrix A) - * @param[in] b Input tensor (Matrix B) - * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations - * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p a. - * @param[in] info GEMM meta-data - */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); - - /** Indicates whether or not this function can be used to process the given parameters. - * - * @param[in] a Input tensor info (Matrix A) - * @param[in] b Input tensor info (Matrix B) - * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations - * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p a. - * @param[in] info GEMM meta-data - * - * @return a status. - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); - /** Checks if activation is supported by the gemm assembly dispatcher - * - * @param[in] activation Activation to check - * - * @return True if activation is supported else false - */ - static bool is_activation_supported(const ActivationLayerInfo &activation); - /** Was the function successfully configured?
- * - * @return True if the function is configured and ready to run - */ - bool is_configured() const; - - // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */ - MemoryGroup _memory_group; /**< Function memory group */ - IWeightsManager *_weights_manager; /**< Pointer to the weights manager */ -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */ diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h deleted file mode 100644 index 644018a718..0000000000 --- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H -#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H - -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/Tensor.h" - -#include "support/Cast.h" - -namespace arm_compute -{ -namespace cpu -{ -/* Tensor handler to wrap and handle tensor allocations on workspace buffers */ -class CpuAuxTensorHandler -{ -public: - CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false) - : _tensor() - { - _tensor.allocator()->soft_init(info); - - ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) - { - _tensor.allocator()->allocate(); - if(pack_inject) - { - pack.add_tensor(slot_id, &_tensor); - _injected_tensor_pack = &pack; - _injected_slot_id = slot_id; - } - } - else - { - _tensor.allocator()->import_memory(packed_tensor->buffer()); - } - } - - CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) - : _tensor() - { - _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) - { - _tensor.allocator()->import_memory(tensor.buffer()); - } - } - - CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; - CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler &) = delete; - - ~CpuAuxTensorHandler() - { - if(_injected_tensor_pack) - { - _injected_tensor_pack->remove_tensor(_injected_slot_id); - } - } - - ITensor *get() - { - return &_tensor; - } - - ITensor *operator()() - { - return &_tensor; - } - -private: - Tensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
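A hedged sketch of how an operator typically consumes this handler inside its run() method, mirroring the CpuSoftmaxGeneric::run() shown earlier in this diff (ExampleOp, its _tmp_info member and the slot index are illustrative, not part of the library):

    void ExampleOp::run(ITensorPack &tensors)
    {
        // Bind the auxiliary TensorInfo to the workspace buffer the caller packed for
        // this slot; if none of sufficient size is present, the handler falls back to
        // allocating its own tensor (pack_inject = false keeps the pack unchanged).
        CpuAuxTensorHandler tmp(offset_int_vec(0), _tmp_info, tensors, false);

        // tmp.get() is then used like any other ITensor when building kernel packs.
        ITensorPack kernel_pack = { { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC) },
                                    { TensorType::ACL_DST, tmp.get() } };
        // ... schedule the kernel with kernel_pack; the handler releases its tensor
        // when it goes out of scope.
    }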
\ No newline at end of file diff --git a/src/runtime/gpu/cl/IClOperator.h b/src/runtime/gpu/cl/IClOperator.h deleted file mode 100644 index 049bf05dc1..0000000000 --- a/src/runtime/gpu/cl/IClOperator.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICL_OPERATOR_H -#define ARM_COMPUTE_ICL_OPERATOR_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/runtime/CL/ICLOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -using IClOperator = experimental::ICLOperator; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICL_OPERATOR_H */ diff --git a/src/runtime/gpu/cl/operators/ClActivation.cpp b/src/runtime/gpu/cl/operators/ClActivation.cpp deleted file mode 100644 index 71aa57bdbd..0000000000 --- a/src/runtime/gpu/cl/operators/ClActivation.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/gpu/cl/operators/ClActivation.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClActivationKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClActivationKernel>(); - k->configure(compile_context, src, dst, act_info); - _kernel = std::move(k); -} - -Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClActivationKernel::validate(src, dst, act_info); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClActivation.h b/src/runtime/gpu/cl/operators/ClActivation.h deleted file mode 100644 index 235b826b87..0000000000 --- a/src/runtime/gpu/cl/operators/ClActivation.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ACTIVATION_H -#define ARM_COMPUTE_CL_ACTIVATION_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClActivationKernel */ -class ClActivation : public IClOperator -{ -public: - /** Constructor */ - ClActivation() = default; - /** Configure operator for a given list of arguments - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] activation_info Activation layer parameters. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); - /** Static function to check if given info will lead to a valid configuration of @ref ClActivation - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - * @param[in] act_info Activation layer information. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ACTIVATION_H */ diff --git a/src/runtime/gpu/cl/operators/ClAdd.cpp b/src/runtime/gpu/cl/operators/ClAdd.cpp deleted file mode 100644 index 01f550f819..0000000000 --- a/src/runtime/gpu/cl/operators/ClAdd.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClAdd.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); - _kernel = std::move(k); -} - -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClAdd.h b/src/runtime/gpu/cl/operators/ClAdd.h deleted file mode 100644 index f751d8dc83..0000000000 --- a/src/runtime/gpu/cl/operators/ClAdd.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ADD_H -#define ARM_COMPUTE_CL_ADD_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run arithmetic addition - * - * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @note The function performs an arithmetic addition between two tensors. - */ -class ClAdd : public IClOperator -{ -public: - /** Default Constructor */ - ClAdd() = default; - /** Configure function for a given list of arguments. - * - * Valid configurations (src1,src2) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClAdd - * - * Valid configurations (src1,src2) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ADD_H */ diff --git a/src/runtime/gpu/cl/operators/ClCast.cpp b/src/runtime/gpu/cl/operators/ClCast.cpp deleted file mode 100644 index 3f54004aa7..0000000000 --- a/src/runtime/gpu/cl/operators/ClCast.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClCast.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClCastKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) -{ - auto k = std::make_unique<kernels::ClCastKernel>(); - k->configure(compile_context, src, dst, policy); - _kernel = std::move(k); -} - -Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - return kernels::ClCastKernel::validate(src, dst, policy); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClCast.h b/src/runtime/gpu/cl/operators/ClCast.h deleted file mode 100644 index 69e028debd..0000000000 --- a/src/runtime/gpu/cl/operators/ClCast.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CAST_H
-#define ARM_COMPUTE_CL_CAST_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCastKernel */
-class ClCast : public IClOperator
-{
-public:
-    /** Constructor */
-    ClCast() = default;
-    /** Configure operator for a given list of arguments
-     *
-     * @note Input data type must be different than output data type.
-     *
-     * Valid data layouts:
-     * - All
-     *
-     * Valid data type configurations:
-     * |src            |dst                                     |
-     * |:--------------|:---------------------------------------|
-     * |U8             | S8, U16, S16, U32, S32, F16, F32       |
-     * |U16            | U8, S8, S16, U32, S32, F16, F32        |
-     * |S16            | U8, S8, U16, U32, S32, F16, F32        |
-     * |U32            | U8, S8, U16, S16, S32, F16, F32        |
-     * |S32            | U8, S8, U16, S16, U32, F16, F32        |
-     * |F16            | U8, S8, U16, S16, U32, F32             |
-     * |F32            | U8, S8, U16, S16, U32, F16             |
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[out] dst             The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy          Conversion policy.
-     */
-    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref ClCast::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CAST_H */
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.cpp b/src/runtime/gpu/cl/operators/ClConcatenate.cpp
deleted file mode 100644
index 4385fcfaed..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-ClConcatenate::ClConcatenate()
-    : _concat_kernels(),
-      _num_inputs(0),
-      _axis(Window::DimX)
-{
-}
-
-void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
-{
-    ARM_COMPUTE_ERROR_ON(dst == nullptr);
-    _axis       = axis;
-    _num_inputs = src_vector.size();
-
-    TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
-    std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
-    std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t;
-    });
-
-    // dst auto initialization if not yet initialized
-    auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
-
-    unsigned int offset = 0;
-    switch(_axis)
-    {
-        case Window::DimX:
-        {
-            switch(_num_inputs)
-            {
-                case 2:
-                {
-                    // Configure WidthConcatenate2Tensors kernel
-                    auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
-                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
-                    _concat_kernels.emplace_back(std::move(kernel));
-                    break;
-                }
-                case 4:
-                {
-                    // Configure WidthConcatenate4Tensors kernel
-                    auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
-                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
-                    _concat_kernels.emplace_back(std::move(kernel));
-                    break;
-                }
-                default:
-                {
-                    // Configure generic case WidthConcatenate kernels
-                    for(unsigned int i = 0; i < _num_inputs; ++i)
-                    {
-                        auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
-                        kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                        offset += src_vector.at(i)->dimension(_axis);
-                        _concat_kernels.emplace_back(std::move(kernel));
-                    }
-                    break;
-                }
-            }
-            break;
-        }
-        case Window::DimY:
-        {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
-                kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                offset += src_vector.at(i)->dimension(_axis);
-                _concat_kernels.emplace_back(std::move(kernel));
-            }
-            break;
-        }
-        case Window::DimZ:
-        {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
-                kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                offset += src_vector.at(i)->dimension(_axis);
-                _concat_kernels.emplace_back(std::move(kernel));
-            }
-            break;
-        }
-        case 3:
-        {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel =
std::make_unique<kernels::ClBatchConcatenateKernel>(); - kernel->configure(compile_context, src_vector.at(i), offset, dst); - offset += src_vector.at(i)->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } -} - -Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis) -{ - ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr); - const unsigned int num_inputs = src_vector.size(); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimX: - { - switch(num_inputs) - { - case 2: - // Validate WidthConcatenate2Tensors kernels if there are 2 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); - break; - case 4: - // Validate WidthConcatenate4Tensors kernels if there are 4 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); - break; - default: - // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); - offset += src->dimension(axis); - } - break; - } - break; - } - case Window::DimY: - { - for(const auto &src : src_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); - offset += src->dimension(axis); - } - break; - } - case Window::DimZ: - { - for(const auto &src : src_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); - offset += src->dimension(axis); - } - break; - } - case 3: - { - for(const auto &src : src_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); - offset += src->dimension(axis); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - - if(dst->total_size() != 0) - { - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} - -void ClConcatenate::run(ITensorPack &tensors) -{ - if(tensors.empty()) - { - ARM_COMPUTE_ERROR("No inputs provided"); - } - - if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) - { - ARM_COMPUTE_ERROR("Configured with different number of inputs"); - } - - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) - { - ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); - CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); - } - else - { - int i = 0; - for(auto &k : _concat_kernels) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); - pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); - CLScheduler::get().enqueue_op(*k, pack, true); - ++i; - } - } -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.h b/src/runtime/gpu/cl/operators/ClConcatenate.h deleted file mode 100644 index 
0d960a605c..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CONCATENATE_H
-#define ARM_COMPUTE_CL_CONCATENATE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
- * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
- * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
- */
-class ClConcatenate : public IClOperator
-{
-public:
-    /** Default constructor */
-    ClConcatenate();
-    /** Initialise the kernel's inputs vector and dst.
-     *
-     * @note Input and dst tensor dimensions preconditions differ depending on the concatenation axis.
-     * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
-     *       @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
-     *
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] src_vector      The vector containing all the tensor infos to concatenate. Data types supported: All
-     * @param[out]    dst             Destination tensor info. Data types supported: same as @p src_vector.
-     * @param[in]     axis            Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
-     */
-    void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClConcatenate
-     *
-     * @note Input and dst tensor dimensions preconditions differ depending on the concatenation axis.
-     * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
-     *       @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
-     *
-     * @param[in] src_vector The vector containing all the tensor infos to concatenate. Data types supported: All
-     * @param[in] dst        Destination tensor info. Data types supported: same as @p src_vector.
-     * @param[in] axis       Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
-     *
-     * @return a status
-     */
-    static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::vector<std::unique_ptr<IClKernel>> _concat_kernels;
-    unsigned int                            _num_inputs;
-    unsigned int                            _axis;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CONCATENATE_H */
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 0d2f2925d3..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
-    k->configure(compile_context, src, dst, original_src_shape, data_layout);
-    _kernel = std::move(k);
-}
-
-Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
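For reference, the thin operators in this family share one calling idiom: run the static validate() first, then configure() the stateless operator. A hedged sketch using ClConvertFullyConnectedWeights follows; every shape, data type and the compile-context lookup are illustrative assumptions, not values from this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLKernelLibrary.h"
#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"

void convert_fc_weights_example()
{
    using namespace arm_compute;
    // Assumed 2D fully connected weights and the 4D shape they originally had
    // (7 * 7 * 512 = 25088 inputs, 4096 outputs, NCHW-trained).
    TensorInfo  src(TensorShape(4096U, 25088U), 1, DataType::F32);
    TensorInfo  dst(src);
    TensorShape original_src_shape(7U, 7U, 512U, 4096U);

    Status s = opencl::ClConvertFullyConnectedWeights::validate(&src, &dst, original_src_shape, DataLayout::NCHW);
    if(bool(s))
    {
        opencl::ClConvertFullyConnectedWeights op;
        op.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, original_src_shape, DataLayout::NCHW);
    }
}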
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
deleted file mode 100644
index efedc2fcb7..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H
-#define ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
-class ClConvertFullyConnectedWeights : public IClOperator
-{
-public:
-    /** Constructor */
-    ClConvertFullyConnectedWeights() = default;
-    /** Initialise the kernel's inputs and outputs
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  src                The src tensor info. Data types supported: All.
-     * @param[out] dst                The dst tensor info. Data types supported: Same as @p src
-     * @param[in]  original_src_shape Shape of the original src tensor (the one entering fully connected layer).
-     * @param[in]  data_layout        The data layout the weights have been trained in.
-     */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-    /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClConvertFullyConnectedWeightsKernel.
-     *
-     * @param[in] src                First tensor src info. Data types supported: All.
-     * @param[in] dst                Output tensor info. Data types supported: same as @p src.
-     * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
-     * @param[in] data_layout        The data layout the weights have been trained in.
- * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H */ diff --git a/src/runtime/gpu/cl/operators/ClCopy.cpp b/src/runtime/gpu/cl/operators/ClCopy.cpp deleted file mode 100644 index 2bdb1f5ba1..0000000000 --- a/src/runtime/gpu/cl/operators/ClCopy.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClCopy.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClCopyKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window) -{ - auto k = std::make_unique<kernels::ClCopyKernel>(); - k->configure(compile_context, src, dst, dst_window); - _kernel = std::move(k); -} - -Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window) -{ - return kernels::ClCopyKernel::validate(src, dst, dst_window); -} -} // namespace opencl -} // namespace arm_compute
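For reference, running one of these single-kernel operators goes through an ITensorPack rather than member tensors. A hedged sketch follows; the CLTensor arguments and their allocation are assumed to have been set up by the caller, and the compile-context lookup is one plausible way to obtain it.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClCopy.h"

void copy_example(arm_compute::CLTensor &src_cl, arm_compute::CLTensor &dst_cl)
{
    using namespace arm_compute;
    opencl::ClCopy copy;
    copy.configure(CLKernelLibrary::get().get_compile_context(), src_cl.info(), dst_cl.info());

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src_cl);
    pack.add_tensor(TensorType::ACL_DST, &dst_cl);
    copy.run(pack); // the ICLOperator base enqueues the configured ClCopyKernel
}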
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClCopy.h b/src/runtime/gpu/cl/operators/ClCopy.h deleted file mode 100644 index 0b99676f65..0000000000 --- a/src/runtime/gpu/cl/operators/ClCopy.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_COPY_H -#define ARM_COMPUTE_CL_COPY_H - -#include "arm_compute/core/Window.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClCopyKernel */ -class ClCopy : public IClOperator -{ -public: - /** Constructor */ - ClCopy() = default; - /** Initialise the function's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[out] dst Output tensor info. Data types supported: Same as @p src. - * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClCopyKernel - * - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] dst Output tensor info. Data types supported: Same as @p src. - * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_COPY_H */ diff --git a/src/runtime/gpu/cl/operators/ClCrop.cpp b/src/runtime/gpu/cl/operators/ClCrop.cpp deleted file mode 100644 index 17bb11912f..0000000000 --- a/src/runtime/gpu/cl/operators/ClCrop.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClCrop.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClCropKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) -{ - auto k = std::make_unique<kernels::ClCropKernel>(); - k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window); - _kernel = std::move(k); -} - -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) -{ - return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); -} -} // namespace opencl -} // namespace arm_compute
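For reference, a hedged configuration sketch for the crop operator above. The coordinates, shapes and the output window size are illustrative assumptions; the header that follows pins down only that src is NHWC and dst is F32.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLKernelLibrary.h"
#include "src/runtime/gpu/cl/operators/ClCrop.h"

void crop_example()
{
    using namespace arm_compute;
    // Assumed NHWC source: 3 channels, 64x64 image, batch of 2.
    TensorInfo src_info(TensorShape(3U, 64U, 64U, 2U), 1, DataType::U8);
    src_info.set_data_layout(DataLayout::NHWC);
    // Assumed destination covering the cropped window; dst must be F32.
    TensorInfo dst_info(TensorShape(3U, 33U, 33U), 1, DataType::F32);
    dst_info.set_data_layout(DataLayout::NHWC);

    const Coordinates2D start{16, 16};
    const Coordinates2D end{48, 48};
    if(bool(opencl::ClCrop::validate(&src_info, &dst_info, start, end, 0 /* batch_index */)))
    {
        opencl::ClCrop crop;
        crop.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, start, end, 0);
    }
}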
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClCrop.h b/src/runtime/gpu/cl/operators/ClCrop.h
deleted file mode 100644
index acfbf14742..0000000000
--- a/src/runtime/gpu/cl/operators/ClCrop.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CROP_H
-#define ARM_COMPUTE_CL_CROP_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCropKernel */
-class ClCrop : public IClOperator
-{
-public:
-    /** Constructor */
-    ClCrop() = default;
-    /** Initialise the function's source and destination.
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  src                 Source tensor info. Data type supported: All. Data layouts supported: NHWC.
-     * @param[out] dst                 Destination tensor info. Data type supported: F32
-     * @param[in]  start               Coordinates of where to start cropping the image.
-     * @param[in]  end                 Coordinates of where to end cropping the image.
-     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p src.
-     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
-     * @param[in]  dst_window          Output window to be used in case only copying into part of a tensor. Default is nullptr.
-     */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                   Window *dst_window = nullptr);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClCropKernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in] src                 Source tensor info. Data type supported: All. Data layouts supported: NHWC.
-     * @param[in] dst                 Destination tensor info. Data type supported: F32
-     * @param[in] start               Coordinates of where to start cropping the image.
-     * @param[in] end                 Coordinates of where to end cropping the image.
-     * @param[in] batch_index         Fourth dimension index of the 3D image to crop in @p src.
-     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
-     * @param[in] dst_window          Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                           Window *dst_window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CROP_H */
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.cpp b/src/runtime/gpu/cl/operators/ClDequantize.cpp
deleted file mode 100644
index 0c1391bb45..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDequantize.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::ClDequantizeKernel>();
-    k->configure(compile_context, src, dst);
-    _kernel = std::move(k);
-}
-
-Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::ClDequantizeKernel::validate(src, dst);
-}
-
-void ClDequantize::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.h b/src/runtime/gpu/cl/operators/ClDequantize.h
deleted file mode 100644
index 47fad3eeee..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H -#define ARM_COMPUTE_CL_DEQUANTIZE_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */ -class ClDequantize : public IClOperator -{ -public: - /** Constructor */ - ClDequantize() = default; - /** Set the input and output tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClDequantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited method overridden - void run(ITensorPack &tensors) override; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */ diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp deleted file mode 100644 index 13ef42a640..0000000000 --- a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClActivationKernel.h" -#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace -{ -ITensorPack select_activation_src_dst(ITensorPack &tensors) -{ - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST)); - pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST)); - return pack; -} -} // namespace - -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - - // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info); - _direct_conv_kernel = std::move(k); - - // Configure border handler - PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique<CLFillBorderKernel>(); - b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); - _src_border_handler = std::move(b); - - // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) - { - auto a = std::make_unique<kernels::ClActivationKernel>(); - a->configure(compile_context, dst, dst, act_info); - _activation_kernel = std::move(a); - } - - // Tune kernels - CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); -} - -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target())); - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); - } - return Status{}; -} - -void ClDirectConv2d::run(ITensorPack &tensors) -{ - // Run border handler - CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false); - // Run direct convolution - CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); - // Run activation kernel - if(_activation_kernel) - { - auto act_pack = select_activation_src_dst(tensors); - CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); - } -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h 
b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
deleted file mode 100644
index e069733fab..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConv2d.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
-#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run a direct convolution layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::ClDirectConv2dKernel
- */
-class ClDirectConv2d : public IClOperator
-{
-public:
-    /** Constructor */
-    ClDirectConv2d() = default;
-    /** Set the src and dst tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
-     *                             while every optional dimension from 4 and above represents a batch of srcs.
-     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
-     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent a batch of dsts.
-     *                             Data types supported: Same as @p src.
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
- * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClDirectConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _src_border_handler{ nullptr }; - std::unique_ptr<IClKernel> _activation_kernel{ nullptr }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
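For reference, the removed ClDirectConv2d operator separates configuration (static shape and type information via ITensorInfo) from execution (real tensors supplied through an ITensorPack). A minimal usage sketch follows; the NHWC F32 shapes and the ACL_SRC_0/ACL_SRC_1/ACL_SRC_2/ACL_DST pack-slot assignment are illustrative assumptions, not taken from the library's tests:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"

using namespace arm_compute;

void direct_conv2d_sketch()
{
    // Illustrative shapes: NHWC F32, 16 IFM, 3x3 kernel, 32 OFM, unit stride, no padding.
    TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32);
    TensorInfo weights(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32);
    TensorInfo biases(TensorShape(32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 30U, 30U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 0, 0);

    // validate() reports problems as a Status instead of asserting at configure time.
    opencl::ClDirectConv2d conv;
    if(bool(opencl::ClDirectConv2d::validate(&src, &weights, &biases, &dst, conv_info)))
    {
        conv.configure(CLKernelLibrary::get().get_compile_context(), &src, &weights, &biases, &dst, conv_info);
    }

    // Execution: the operator holds no tensors of its own; they arrive via the pack.
    CLTensor src_t, weights_t, biases_t, dst_t; // allocation/import omitted
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src_t);     // slot assignment assumed;
    pack.add_tensor(TensorType::ACL_SRC_1, &weights_t); // the kernel defines which
    pack.add_tensor(TensorType::ACL_SRC_2, &biases_t);  // ACL_SRC_* slots it reads
    pack.add_tensor(TensorType::ACL_DST, &dst_t);
    conv.run(pack);
}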
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp b/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp deleted file mode 100644 index e5b836a0d8..0000000000 --- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h" - -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); -} - -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); -} - -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info); -} - -void ClElementwiseSquaredDiff::configure(const ClCompileContext 
&compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); -} - -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h b/src/runtime/gpu/cl/operators/ClElementwiseOperations.h deleted file mode 100644 index b9ab1405c8..0000000000 --- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H -#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division - * - * @note The tensor data type for the inputs must be F16/F32. - * @note The function performs an arithmetic division between two tensors. - */ -class ClElementwiseDivision : public IClOperator -{ -public: - /** Default Constructor */ - ClElementwiseDivision() = default; - /** Configure function for a given list of arguments. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: F16/F32. - * @param[in] src2 Second source tensor info. same as @p src1. 
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClElementwiseDivision - * - * @param[in] src1 First source tensor info. Data types supported: F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. - * @note The function performs a max operation between two tensors. - */ -class ClElementwiseMax : public IClOperator -{ -public: - /** Default Constructor */ - ClElementwiseMax() = default; - /** Configure function for a given list of arguments. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[out] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max - * - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. - * @note The function performs a min operation between two tensors. - */ -class ClElementwiseMin : public IClOperator -{ -public: - /** Default Constructor */ - ClElementwiseMin() = default; - /** Configure function for a given list of arguments. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[out] dst Destination tensor info. 
Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min - * - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference - * - * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32. - * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) - */ -class ClElementwiseSquaredDiff : public IClOperator -{ -public: - /** Default Constructor */ - ClElementwiseSquaredDiff() = default; - /** Configure function for a given list of arguments. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[out] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference - * - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power - * - * @note The tensor data type for the inputs must be F16/F32. - * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) - */ -class ClElementwisePower : public IClOperator -{ -public: - /** Default Constructor */ - ClElementwisePower() = default; - /** Configure function for a given list of arguments. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: F16/F32. 
- * @param[out] dst Destination tensor info. Data types supported:F16/F32. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power - * - * @param[in] src1 First source tensor info. Data types supported: F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: F16/F32. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */ diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp b/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp deleted file mode 100644 index 7b830a077f..0000000000 --- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h" - -#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT); - _kernel = std::move(k); -} - -Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT); -} - -void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::EXP); - _kernel = std::move(k); -} - -Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP); -} - -void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::NEG); - _kernel = std::move(k); -} - -Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG); -} - -void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::SIN); - _kernel = std::move(k); -} - -Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN); -} - -void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::ABS); - _kernel = std::move(k); -} - -Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS); -} - -void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::LOG); - _kernel = std::move(k); -} - -Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG); -} - -void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::ROUND); - _kernel = std::move(k); -} - -Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h b/src/runtime/gpu/cl/operators/ClElementwiseUnary.h deleted file mode 100644 index b40e3e9a3b..0000000000 --- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h +++ 
/dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H -#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to perform inverse square root on an src tensor. */ -class ClRsqrt : public IClOperator -{ -public: - /** Constructor */ - ClRsqrt() = default; - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClRsqrt - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to perform exponential on an src tensor. */ -class ClExp : public IClOperator -{ -public: - /** Constructor */ - ClExp() = default; - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClExp - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to negate an src tensor. */ -class ClNeg : public IClOperator -{ -public: - /** Constructor */ - ClNeg() = default; - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. 
Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClNeg - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to calculate sine of an src tensor. */ -class ClSin : public IClOperator -{ -public: - /** Constructor */ - ClSin() = default; - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClSin - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to perform elementwise log on an src tensor. */ -class ClLog : public IClOperator -{ -public: - /** Constructor */ - ClLog() = default; - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClLog - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to get the absolute value of an src tensor. */ -class ClAbs : public IClOperator -{ -public: - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClAbs - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; - -/** Basic function to get the round (to the nearest even) value of an src tensor. */ -class ClRound : public IClOperator -{ -public: - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. 
- */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClRound - * - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */ diff --git a/src/runtime/gpu/cl/operators/ClFill.cpp b/src/runtime/gpu/cl/operators/ClFill.cpp deleted file mode 100644 index 4d0afaef24..0000000000 --- a/src/runtime/gpu/cl/operators/ClFill.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClFill.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClFillKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) -{ - auto k = std::make_unique<kernels::ClFillKernel>(); - k->configure(compile_context, tensor, constant_value, dst_window); - _kernel = std::move(k); -} - -Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) -{ - return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); -} -} // namespace opencl -} // namespace arm_compute
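ClFill above shows the thin-wrapper pattern shared by the simple removed operators (ClFlatten and ClFloor below follow it too): configure() instantiates the backing kernel and stores it in the _kernel member inherited from IClOperator, while validate() forwards to the kernel's static check. A usage sketch under stated assumptions; the 8x8 F32 shape and the ACL_SRC_DST pack slot are illustrative, not taken from the library:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClFill.h"

using namespace arm_compute;

void fill_sketch()
{
    TensorInfo info(TensorShape(8U, 8U), 1, DataType::F32);
    const PixelValue one(1.f); // value used to fill every element

    // Same validate-then-configure flow as the other operators.
    opencl::ClFill fill;
    if(bool(opencl::ClFill::validate(&info, one)))
    {
        fill.configure(CLKernelLibrary::get().get_compile_context(), &info, one);
    }

    CLTensor tensor; // allocation omitted
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_DST, &tensor); // slot assumed; the kernel defines it
    fill.run(pack);
}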
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClFill.h b/src/runtime/gpu/cl/operators/ClFill.h deleted file mode 100644 index e632d88546..0000000000 --- a/src/runtime/gpu/cl/operators/ClFill.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FILL_H -#define ARM_COMPUTE_CL_FILL_H - -#include "arm_compute/core/Window.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClFillKernel */ -class ClFill : public IClOperator -{ -public: - /** Constructor */ - ClFill() = default; - /** Initialise the kernel's tensor and filling value - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] tensor Source tensor info. Supported data types: All. - * @param[in] constant_value The value used to fill the planes of the tensor - * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClFillKernel - * - * @param[in] tensor Source tensor info. Data types supported: All. - * @param[in] constant_value The value used to fill the planes of the tensor. - * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. - * - * @return a status - */ - static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FILL_H */ diff --git a/src/runtime/gpu/cl/operators/ClFlatten.cpp b/src/runtime/gpu/cl/operators/ClFlatten.cpp deleted file mode 100644 index 060b653dee..0000000000 --- a/src/runtime/gpu/cl/operators/ClFlatten.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClFlatten.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClReshapeKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClReshapeKernel>(); - k->configure(compile_context, src, dst); - _kernel = std::move(k); -} - -Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClReshapeKernel::validate(src, dst); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClFlatten.h b/src/runtime/gpu/cl/operators/ClFlatten.h deleted file mode 100644 index 20ad06ee57..0000000000 --- a/src/runtime/gpu/cl/operators/ClFlatten.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_FLATTEN_H -#define ARM_COMPUTE_CL_FLATTEN_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to flatten a given input */ -class ClFlatten : public IClOperator -{ -public: - /** Constructor */ - ClFlatten() = default; - /** Configure operator for a given list of arguments - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------| - * |All |All | - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor to flatten with at least 3 dimensions. - * The dimensions above the third will be interpreted as batches. Data types supported: All - * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where: - * w = width input tensor, h = height input tensor and d = depth input tensor. - * Data type supported: same as @p src - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClFlatten::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FLATTEN_H */ diff --git a/src/runtime/gpu/cl/operators/ClFloor.cpp b/src/runtime/gpu/cl/operators/ClFloor.cpp deleted file mode 100644 index 94e77c0c54..0000000000 --- a/src/runtime/gpu/cl/operators/ClFloor.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/gpu/cl/operators/ClFloor.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClFloorKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClFloorKernel>(); - k->configure(compile_context, src, dst); - _kernel = std::move(k); -} - -Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClFloorKernel::validate(src, dst); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClFloor.h b/src/runtime/gpu/cl/operators/ClFloor.h deleted file mode 100644 index f54eef9140..0000000000 --- a/src/runtime/gpu/cl/operators/ClFloor.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FLOOR_H -#define ARM_COMPUTE_CL_FLOOR_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClFloorKernel */ -class ClFloor : public IClOperator -{ -public: - /** Constructor */ - ClFloor() = default; - /** Configure operator for a given list of arguments - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref ClFloor - * - * @param[in] src Source tensor info. Data types supported: F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FLOOR_H */ diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp deleted file mode 100644 index a80375447d..0000000000 --- a/src/runtime/gpu/cl/operators/ClGemm.cpp +++ /dev/null @@ -1,760 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClGemm.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Log.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/gpu/cl/IClKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" -#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" -#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "support/Cast.h" -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -namespace opencl -{ -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; -using namespace arm_compute::experimental; -using namespace arm_compute::utils::cast; -using namespace arm_compute::opencl::kernels; - -namespace -{ -inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) -{ - switch(kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - case CLGEMMKernelType::RESHAPED_V1: - case CLGEMMKernelType::RESHAPED: - { - return true; - } - default: - { - return false; - } - } -} -//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) -{ - if(!constant_weights) - { - return CLGEMMKernelType::NATIVE_V1; - } - - auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) - { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); - return gemm_kernel.gemm_type; - } - } - gemm_kernel = 
auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); - return gemm_kernel.gemm_type; -} -// Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) -{ - // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel - TensorInfo tmp_b_info{}; - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) - { - return false; - } - // Validate mm kernel - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) - { - return false; - } - gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) - { - return false; - } - return true; -} - -//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) -{ - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) - { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; - } - } - config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; -} - -// Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) -{ - // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Validate reshape LHS kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) - { - return false; - } - - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - 
if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) - { - return false; - } - // Validate mm kernel - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) - { - return false; - } - return true; -} - -//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) -{ - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) - { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) - { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; - } - } - config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; -} -} // namespace - -ClGemm::ClGemm() - : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()), - _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()), - _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), - _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()), - _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), - _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), - _tmp_a(), - _tmp_b(), - _reshape_b_only_on_first_run(false), - _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1), - _aux_mem(AuxTensorIdx::Count) -{ -} - -void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_kernel->set_target(gpu_target); - - GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); - - // Configure and tune matrix multiply kernel - _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - // Tune kernel statically - CLScheduler::get().tune_kernel_static(*_mm_kernel); -} - -void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - // Set the target for the kernels - _reshape_lhs_kernel->set_target(gpu_target); - _mm_kernel->set_target(gpu_target); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - // Configure interleave kernel - _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); - - // Configure transpose kernel - _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); - - // Configure and tune matrix multiply kernel - _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - CLScheduler::get().tune_kernel_static(*_mm_kernel); - - // Request memory for LHS and RHS reshape matrix - _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); -} - -void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _reshape_lhs_kernel->set_target(gpu_target); - _mm_kernel->set_target(gpu_target); - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); - - _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); - - // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Request memory for LHS and RHS reshape matrix - _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); -} - -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _mm_kernel->set_target(gpu_target); - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); - - // Transpose matrix - _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); - - // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) - // During the prepare stage we check the padding requirement for the lhs and dst tensors. 
If they do not have - // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false - - // Configure matrix multiply kernel with no y padding support - kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Configure matrix multiply kernel with y padding support - kernel_info.has_pad_y = true; - _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); -} - -Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias()); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta, - false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; -} - -Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, - true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; -} - -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; - - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - - // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - gemm_info.constant_weights()); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - - switch(_gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED: - { - configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - // Get the GPU target - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - - // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), a->data_type(), m, n, k, batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights()); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; - - switch(gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - default: - { - ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); - } - } - - return Status{}; -} - -void ClGemm::run(ITensorPack &tensors) -{ - const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0); - const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1); - const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2); - ITensor *dst = tensors.get_tensor(ACL_DST); - - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst); - - CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true); - CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true); - - // Prepare the consts if needed - prepare(tensors); - - // Run matrix multiply kernel - switch(_gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - case CLGEMMKernelType::RESHAPED: - { - // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; - CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { 
ACL_DST, rhs_reshaped.get() } }; - CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); - } - - ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } }; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) - { - CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); - } - else - { - CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true); - } - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; - CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); - } - // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement - // Check if the lhs or dst tensors have padding - const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom; - const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom; - bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0); - - ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } }; - if(has_pad_y) - { - CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true); - } - else - { - CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true); - } - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -void ClGemm::prepare(ITensorPack &constants) -{ - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); - - // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) - { - CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); - ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; - CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); - } -} - -experimental::MemoryRequirements ClGemm::workspace() const -{ - return _aux_mem; -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClGemm.h b/src/runtime/gpu/cl/operators/ClGemm.h deleted file mode 100644 index bd9ca17edf..0000000000 --- a/src/runtime/gpu/cl/operators/ClGemm.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_H
-#define ARM_COMPUTE_CL_GEMM_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTypes.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if RESHAPED_V1 or RESHAPED is selected by the heuristic model)
- * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if RESHAPED_V1, RESHAPED or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyKernel (only if either NATIVE_V1 or RESHAPED_V1 is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- */
-class ClGemm : public IClOperator
-{
-public:
- /** Constructor */
- ClGemm();
- /** Initialise the kernel's inputs and output
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------------|:-----------|:---------|:--------------|
- * |F32 |F32 |F32 |F32 |
- * |F16 |F16 |F16 |F16 |
- *
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- *
- * @note All tensors must have the same data type.
- *
- * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
- * in case matrix A and matrix B have been already transformed.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClGemm::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- void configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
-private:
- enum AuxTensorIdx
- {
- LhsReshape = 0,
- RhsReshape,
- Count
- };
-
-private:
- std::unique_ptr<kernels::ClGemmMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel;
- std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
- TensorInfo _tmp_a;
- TensorInfo _tmp_b;
- bool _reshape_b_only_on_first_run;
- CLGEMMKernelType _gemm_kernel_type;
-
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_H */
diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp b/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
deleted file mode 100644
index 400efe450d..0000000000
--- a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClLogicalNot.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); - k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT); - _kernel = std::move(k); -} - -Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.h b/src/runtime/gpu/cl/operators/ClLogicalNot.h deleted file mode 100644 index 25ddf564b5..0000000000 --- a/src/runtime/gpu/cl/operators/ClLogicalNot.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
-#define ARM_COMPUTE_CL_LOGICAL_NOT_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
-class ClLogicalNot : public IClOperator
-{
-public:
- /** Constructor */
- ClLogicalNot() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: U8.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * @param[in] src Source tensor info. Data types supported: U8.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
diff --git a/src/runtime/gpu/cl/operators/ClMul.cpp b/src/runtime/gpu/cl/operators/ClMul.cpp
deleted file mode 100644
index d1e2bc806f..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "src/runtime/gpu/cl/operators/ClMul.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClMulKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClMulKernel>(); - k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); - _kernel = std::move(k); -} - -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); -} - -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClComplexMulKernel>(); - k->configure(compile_context, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); -} -} // namespace opencl -} // namespace arm_compute
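For context on the thin-operator pattern that ClMul.cpp above follows: the operator configures a single ClMulKernel and inherits its execution path from IClOperator, so a caller wires tensor infos at configure time and passes the actual tensors through an ITensorPack at run time. A minimal usage sketch, assuming the calling convention visible elsewhere in this patch; the CLTensor setup, the compile-context lookup and the pack slots (ACL_SRC_0/ACL_SRC_1/ACL_DST) are assumptions, not code taken from ClMul itself:

    // Sketch only: driving the removed opencl::ClMul (tensor allocation elided).
    opencl::ClMul mul;
    const float scale = 1.f; // contract: positive, and exactly 1/255 or 1/2^n with 0 <= n <= 15
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClMul::validate(src0.info(), src1.info(), dst.info(), scale,
                                                       ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    mul.configure(CLKernelLibrary::get().get_compile_context(), src0.info(), src1.info(), dst.info(), scale,
                  ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    // Execution takes the real tensors; the slot ids mirror those used by ClGemm::run above.
    ITensorPack pack{ { ACL_SRC_0, &src0 }, { ACL_SRC_1, &src1 }, { ACL_DST, &dst } };
    mul.run(pack);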
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClMul.h b/src/runtime/gpu/cl/operators/ClMul.h
deleted file mode 100644
index 4a662b3276..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_MUL_H
-#define ARM_COMPUTE_CL_MUL_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref opencl::kernels::ClMulKernel */
-class ClMul : public IClOperator
-{
-public:
- /** Default Constructor */
- ClMul() = default;
- /** Initialise the kernel's sources, dst and conversion policy.
- *
- * Valid configurations (src1,src2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ -class ClComplexMul : public IClOperator -{ -public: - /** Default Constructor */ - ClComplexMul() = default; - /** Initialise the kernel's sources, dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClComplexMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_MUL_H */
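The scale contract in the ClMul documentation above (positive, and exactly 1/255 or 1/2^n with n between 0 and 15) is easy to get wrong from user code. A small standalone check, written here as an illustrative helper rather than anything the library provides:

    // Returns true iff `scale` satisfies the documented ClMul constraint.
    bool is_valid_mul_scale(float scale)
    {
        if(scale == 1.f / 255.f)
        {
            return true;
        }
        for(int n = 0; n <= 15; ++n)
        {
            // 1/2^n is exactly representable in float, so direct comparison is safe here.
            if(scale == 1.f / static_cast<float>(1u << n))
            {
                return true;
            }
        }
        return false;
    }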
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPRelu.cpp b/src/runtime/gpu/cl/operators/ClPRelu.cpp deleted file mode 100644 index d1ce14cc87..0000000000 --- a/src/runtime/gpu/cl/operators/ClPRelu.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClPRelu.h" -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) -{ - auto k = std::make_unique<KernelType>(); - k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); - _kernel = std::move(k); -} - -Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) -{ - return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); -} - -void ClPRelu::run(ITensorPack &tensors) -{ - // Output tensor can be given as nullptr for in-place computation. - // In this case, get the input tensor and use it as the output tensor. - if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) - { - auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); - tensors.add_tensor(TensorType::ACL_DST, src_tensor); - } - IClOperator::run(tensors); -} -} // namespace opencl -} // namespace arm_compute
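ClPRelu::run above implements the documented in-place mode by re-adding the source as the destination when the pack carries no ACL_DST entry. A hedged caller-side sketch (operator configuration and tensor allocation elided; PRELU itself computes f(x) = x for x > 0 and f(x) = alpha * x otherwise, with alpha broadcast from the parameter tensor):

    // In-place PRELU: deliberately omit ACL_DST so run() aliases the source.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &input);        // activations
    pack.add_const_tensor(TensorType::ACL_SRC_1, &alpha_tensor); // PRelu parameters
    prelu.run(pack); // result is written back into `input`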
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPRelu.h b/src/runtime/gpu/cl/operators/ClPRelu.h
deleted file mode 100644
index 70202aeb81..0000000000
--- a/src/runtime/gpu/cl/operators/ClPRelu.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PRELU_H
-#define ARM_COMPUTE_CL_PRELU_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
- *
- * @note The operator implements an activation layer with the PRELU activation function.
- */
-class ClPRelu : public IClOperator
-{
-public:
- /** Default constructor */
- ClPRelu() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PRELU_H */
diff --git a/src/runtime/gpu/cl/operators/ClPermute.cpp b/src/runtime/gpu/cl/operators/ClPermute.cpp
deleted file mode 100644
index 719bb6dac6..0000000000
--- a/src/runtime/gpu/cl/operators/ClPermute.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClPermute.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClPermuteKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - auto k = std::make_unique<kernels::ClPermuteKernel>(); - k->configure(compile_context, src, dst, perm); - _kernel = std::move(k); -} - -Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - return kernels::ClPermuteKernel::validate(src, dst, perm); -} -} // namespace opencl -} // namespace arm_compute
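ClPermute is another pass-through operator; the only subtlety it forwards to ClPermuteKernel is the meaning of the PermutationVector. A reference sketch of the dimension mapping, under the assumption (not stated in this patch) that destination dimension i takes source dimension perm[i]:

    // Illustrative helper, not library code: shape of the permuted tensor,
    // assuming dst dimension i = src dimension perm[i], with rank <= 4.
    TensorShape permuted_shape(const TensorShape &src_shape, const PermutationVector &perm)
    {
        TensorShape dst_shape = src_shape;
        for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
        {
            dst_shape.set(i, src_shape[perm[i]]);
        }
        return dst_shape;
    }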
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPermute.h b/src/runtime/gpu/cl/operators/ClPermute.h deleted file mode 100644 index 20e7a32428..0000000000 --- a/src/runtime/gpu/cl/operators/ClPermute.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_PERMUTE_H -#define ARM_COMPUTE_CL_PERMUTE_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClPermuteKernel */ -class ClPermute : public IClOperator -{ -public: - /** Constructor */ - ClPermute() = default; - /** Initialise the kernel's inputs and outputs and permute vector - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClPermuteKernel. - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src First tensor src info. Data types supported: All. - * @param[in] dst Output tensor info. Data types supported: same as @p src. - * @param[in] perm Permutation vector - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */
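A hedged usage sketch of the removed ClPermute, following the same validate-then-configure idiom as the other operators in this patch; the tensor setup is elided and the permutation vector shown is assumed (a common layout swap), not taken from the diff:

    opencl::ClPermute permute;
    const PermutationVector perm{ 2U, 0U, 1U }; // assumed NCHW <-> NHWC style reordering
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPermute::validate(src.info(), dst.info(), perm));
    permute.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info(), perm);
    // Slot ids assumed to match the ACL_SRC/ACL_DST convention used by ClGemm::run above.
    ITensorPack pack{ { ACL_SRC, &src }, { ACL_DST, &dst } };
    permute.run(pack);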
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp deleted file mode 100644 index 40c2b0a8ba..0000000000 --- a/src/runtime/gpu/cl/operators/ClPool2d.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClPool2d.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - // Configure pooling kernel - auto k = std::make_unique<kernels::ClPool2dKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, src, dst, info, indices); - _pooling = std::move(k); - - const DataType data_type = src->data_type(); - - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode{}; - PixelValue pixel_value(0.f); - if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding) - { - pixel_value = PixelValue(0, data_type, src->quantization_info()); - } - - // Data layout - const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - - switch(data_layout) - { - case DataLayout::NCHW: - border_mode = (PoolingType::MAX == info.pool_type) ? 
BorderMode::REPLICATE : BorderMode::CONSTANT;
- break;
- case DataLayout::NHWC:
- border_mode = BorderMode::CONSTANT;
- if(PoolingType::MAX == info.pool_type)
- {
- if(is_data_type_quantized(data_type))
- {
- std::tie(pixel_value, std::ignore) = get_min_max(data_type);
- }
- else
- {
- pixel_value = PixelValue(std::numeric_limits<float>::lowest());
- }
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
- _border_handler = std::move(b);
-
- // Tune kernels
- CLScheduler::get().tune_kernel_static(*_pooling);
-}
-
-Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
-{
- return kernels::ClPool2dKernel::validate(src, dst, info, indices);
-}
-
-void ClPool2d::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
- CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h
deleted file mode 100644
index 8ac386a64b..0000000000
--- a/src/runtime/gpu/cl/operators/ClPool2d.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOL2D_H
-#define ARM_COMPUTE_CL_POOL2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::ClPool2dKernel
- */
-class ClPool2d : public IClOperator
-{
-public:
- /** Constructor */
- ClPool2d() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] info Pooling layer parameters.
- * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClPool2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr<ICLKernel> _pooling{ nullptr }; - std::unique_ptr<ICLKernel> _border_handler{ nullptr }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_POOL2D_H */ diff --git a/src/runtime/gpu/cl/operators/ClQuantize.cpp b/src/runtime/gpu/cl/operators/ClQuantize.cpp deleted file mode 100644 index 92bbb62ba5..0000000000 --- a/src/runtime/gpu/cl/operators/ClQuantize.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClQuantize.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClQuantizeKernel>(); - k->configure(compile_context, src, dst); - _kernel = std::move(k); -} - -Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClQuantizeKernel::validate(src, dst); -} - -void ClQuantize::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - CLScheduler::get().enqueue_op(*_kernel.get(), tensors); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClQuantize.h b/src/runtime/gpu/cl/operators/ClQuantize.h deleted file mode 100644 index 0b6d2c8cbe..0000000000 --- a/src/runtime/gpu/cl/operators/ClQuantize.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_QUANTIZE_H -#define ARM_COMPUTE_CL_QUANTIZE_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */ -class ClQuantize : public IClOperator -{ -public: - /** Constructor */ - ClQuantize() = default; - /** Set the input and output tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. - * - * @note Output auto initialization is not supported by this function - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClQuantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited method overridden - void run(ITensorPack &tensors) override; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_QUANTIZE_H */ diff --git a/src/runtime/gpu/cl/operators/ClReshape.cpp b/src/runtime/gpu/cl/operators/ClReshape.cpp deleted file mode 100644 index d3fa9f10ab..0000000000 --- a/src/runtime/gpu/cl/operators/ClReshape.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClReshape.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClReshapeKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClReshapeKernel>(); - k->configure(compile_context, src, dst); - _kernel = std::move(k); -} - -Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClReshapeKernel::validate(src, dst); -} -} // namespace opencl -} // namespace arm_compute
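ClReshape.cpp above is the smallest instance of the stateless-operator pattern used throughout this directory: configure() builds the kernel from ITensorInfo descriptors only, and the static validate() forwards to the kernel. For orientation, a minimal caller-side sketch; it is not part of this change, the shapes and the function name are invented, and only the operator, scheduler and pack calls visible in the deleted sources are assumed:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClReshape.h"

using namespace arm_compute;

void reshape_sketch()
{
    CLScheduler::get().default_init(); // create the CL context/queue once

    // Operators are configured on metadata only; no buffers are touched here.
    TensorInfo src_info(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U), 1, DataType::F32); // same element count

    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClReshape::validate(&src_info, &dst_info));
    opencl::ClReshape reshape;
    reshape.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

    // Concrete tensors are only bound at run time, through an ITensorPack.
    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{ { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
    reshape.run(pack);
}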
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClReshape.h b/src/runtime/gpu/cl/operators/ClReshape.h deleted file mode 100644 index 8cccc5776c..0000000000 --- a/src/runtime/gpu/cl/operators/ClReshape.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_RESHAPE_H -#define ARM_COMPUTE_CL_RESHAPE_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClReshapeKernel */ -class ClReshape : public IClOperator -{ -public: - /** Constructor */ - ClReshape() = default; - /** Initialise the kernel's inputs and outputs - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor info. Data type supported: All - * @param[out] output Output info. Data type supported: Same as @p input - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output); - - /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClReshapeKernel - * - * @param[in] input Input tensor info. Data type supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_RESHAPE_H */
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp deleted file mode 100644 index 4730c8a16e..0000000000 --- a/src/runtime/gpu/cl/operators/ClScale.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClScale.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClScaleKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - // Configure Scale kernel - auto k = std::make_unique<kernels::ClScaleKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, src, dst, info); - _kernel = std::move(k); - if(!_kernel->border_size().empty()) - { - auto b = std::make_unique<CLFillBorderKernel>(); - b->configure(compile_context, src, _kernel->border_size(), info.border_mode, info.constant_border_value); - _border_handler = std::move(b); - } - // Tune kernel - CLScheduler::get().tune_kernel_static(*_kernel); -} - -Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - return kernels::ClScaleKernel::validate(src, dst, info); -} - -void ClScale::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - if(!_kernel->border_size().empty()) - { - CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false); - } - CLScheduler::get().enqueue_op(*_kernel.get(), tensors); -} -} // namespace opencl -} // namespace arm_compute
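ClScale.cpp above extends the same pattern with a conditional second kernel: when the configured scale kernel reports a non-empty border_size(), run() enqueues a CLFillBorderKernel first. A hypothetical caller follows; the tensor arguments are assumed pre-allocated, and ScaleKernelInfo is assumed constructible from an interpolation policy plus a border mode as declared in arm_compute/core/KernelDescriptors.h:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClScale.h"

using namespace arm_compute;

void scale_sketch(CLTensor &src, CLTensor &dst)
{
    // REPLICATE borders force run() to enqueue the border handler first.
    const ScaleKernelInfo info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE };

    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClScale::validate(src.info(), dst.info(), info));
    opencl::ClScale scale;
    scale.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info(), info);

    ITensorPack pack{ { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
    scale.run(pack); // border fill (if any), then the scale kernel
}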
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h deleted file mode 100644 index 6eccb59be8..0000000000 --- a/src/runtime/gpu/cl/operators/ClScale.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SCALE_H -#define ARM_COMPUTE_CL_SCALE_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels: - * - * -# @ref CLFillBorderKernel (executed if padding size is different from zero) - * -# @ref kernels::ClScaleKernel - */ -class ClScale : public IClOperator -{ -public: - /** Constructor */ - ClScale() = default; - /** Initialize the function's source, destination, interpolation type and border_mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) - * @param[out] dst Destination tensor info. Data types supported: Same as @p src - * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref ClScale - * - * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. - * @param[in] dst Output tensor info. Data type supported: Same as @p src - * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
- * @param[in] info @ref ScaleKernelInfo descriptor to be used to validate - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - -protected: - std::unique_ptr<ICLKernel> _border_handler{ nullptr }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SCALE_H */ diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.cpp b/src/runtime/gpu/cl/operators/ClSoftmax.cpp deleted file mode 100644 index 975bb0b932..0000000000 --- a/src/runtime/gpu/cl/operators/ClSoftmax.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClSoftmax.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/SoftmaxHelpers.h" -#include "src/runtime/gpu/cl/operators/ClPermute.h" -#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h" -#include "support/Cast.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace opencl -{ -ClSoftmax::ClSoftmax() - : _permute_input(std::make_unique<ClPermute>()), - _permute_output(std::make_unique<ClPermute>()), - _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()), - _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()), - _max_info(), - _sum_info(), - _tmp_info(), - _permuted_src_info(), - _permuted_dst_info(), - _aux_mem(InternalTensorIdx::COUNT) -{ -} - -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); - - const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); - - _needs_permute = actual_axis != 0; - - const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; - ITensorInfo &tmp_output_info = _needs_permute ?
_permuted_dst_info : dst; - - if(_needs_permute) - { - const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); - } - - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); - - TensorShape max_sum_shape = tmp_input_info.tensor_shape(); - _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); - _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type); - - // Set GPU target to kernels - _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target()); - - _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); - _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - - if(_needs_permute) - { - const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); - } - - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); -} - -Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis); - - const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); - const bool needs_permute = actual_axis != 0; - if(needs_permute) - { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); - TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); - } - - // Create intermediate tensor info - DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? 
DataType::S32 : src.data_type(); - TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src.tensor_shape(); - max_sum_shape.set(0, 1); - TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); - - return Status{}; -} - -void ClSoftmax::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false); - CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); - CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); - - if(_needs_permute) - { - ITensorPack pack; - pack.add_const_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, permuted_src.get()); - _permute_input.get()->run(pack); - } - - ITensorPack sum_pack; - ITensorPack norm_pack; - if(_needs_permute) - { - sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); - norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); - } - else - { - sum_pack.add_const_tensor(TensorType::ACL_SRC, src); - norm_pack.add_tensor(TensorType::ACL_DST, dst); - } - sum_pack.add_tensor(TensorType::ACL_DST, tmp.get()); - sum_pack.add_tensor(TensorType::ACL_INT_0, max.get()); - sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get()); - - norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get()); - norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get()); - - CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); - CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - - if(_needs_permute) - { - ITensorPack pack; - pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output.get()->run(pack); - } -} - -experimental::MemoryRequirements ClSoftmax::workspace() const -{ - return _aux_mem; -} -} // namespace opencl -} // namespace arm_compute
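Unlike the single-kernel operators, ClSoftmax owns none of its intermediates: every scratch tensor is advertised through workspace() and must be materialised by the caller. Below is a sketch of satisfying that contract by hand; the slot/size member names follow the experimental::MemoryInfo usage in the file above, the function name is invented, and a real caller would normally go through the MemoryHelpers utilities instead:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClSoftmax.h"

#include <memory>
#include <vector>

using namespace arm_compute;

void run_softmax(opencl::ClSoftmax &softmax, ITensorPack &pack)
{
    // One backing buffer per auxiliary slot the operator asked for.
    std::vector<std::unique_ptr<CLTensor>> workspace;
    for(const auto &req : softmax.workspace())
    {
        if(req.size == 0)
        {
            continue; // slot unused in this configuration (e.g. no permute)
        }
        auto aux = std::make_unique<CLTensor>();
        aux->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        aux->allocator()->allocate();
        pack.add_tensor(req.slot, aux.get());
        workspace.emplace_back(std::move(aux));
    }
    softmax.run(pack); // the CLAuxTensorHandlers inside run() pick the slots back up
}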
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.h b/src/runtime/gpu/cl/operators/ClSoftmax.h deleted file mode 100644 index f19a51fc5e..0000000000 --- a/src/runtime/gpu/cl/operators/ClSoftmax.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SOFTMAX_H -#define ARM_COMPUTE_CL_SOFTMAX_H - -#include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -struct SoftmaxKernelInfo; - -namespace opencl -{ -class ClPermute; -namespace kernels -{ -class ClLogits1DMaxShiftExpSumKernel; -class ClLogits1DNormKernel; -} // namespace kernels -class ClSoftmax : public IClOperator -{ -public: - /** Constructor */ - ClSoftmax(); - /** Configure the operator - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax - * @param[out] dst Destination tensor info. Data types supported: same as @p src - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - * - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); - /** Static function to check if the given info will lead to a valid configuration - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax - * @param[in] dst Destination tensor info. Data types supported: same as @p src - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - enum InternalTensorIdx - { - MAX = 0, - SUM, - TMP, - PERMUTED_SRC, - PERMUTED_DST, - COUNT - }; - - std::unique_ptr<ClPermute> _permute_input; - std::unique_ptr<ClPermute> _permute_output; - std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel; - std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel; - bool _needs_permute{ false }; - - TensorInfo _max_info; - TensorInfo _sum_info; - TensorInfo _tmp_info; - TensorInfo _permuted_src_info; - TensorInfo _permuted_dst_info; - - experimental::MemoryRequirements _aux_mem{}; -}; - -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClSub.cpp b/src/runtime/gpu/cl/operators/ClSub.cpp deleted file mode 100644 index 429f23a837..0000000000 --- a/src/runtime/gpu/cl/operators/ClSub.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClSub.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); - k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); - _kernel = std::move(k); -} - -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClSub.h b/src/runtime/gpu/cl/operators/ClSub.h deleted file mode 100644 index bcad84d583..0000000000 --- a/src/runtime/gpu/cl/operators/ClSub.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SUB_H -#define ARM_COMPUTE_CL_SUB_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run arithmetic subtraction - * - * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @note The function performs an arithmetic subtraction between two tensors. - */ -class ClSub : public IClOperator -{ -public: - /** Default Constructor */ - ClSub() = default; - /** Configure function for a given list of arguments. - * - * Valid configurations (src1,src2) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClSub - * - * Valid configurations (src1,src2) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SUB_H */ diff --git a/src/runtime/gpu/cl/operators/ClTranspose.cpp b/src/runtime/gpu/cl/operators/ClTranspose.cpp deleted file mode 100644 index 48f44282e8..0000000000 --- a/src/runtime/gpu/cl/operators/ClTranspose.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClTranspose.h" - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::ClTransposeKernel>(); - k->configure(compile_context, src, dst); - _kernel = std::move(k); -} - -Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::ClTransposeKernel::validate(src, dst); -} -} // namespace opencl -} // namespace arm_compute
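To tie ClSub's (src1,src2) -> dst table above to code: a hypothetical saturating S16 subtraction with a fused ReLU, using the (S16,S16) -> S16 row. Shapes and the function name are invented; the operator calls are those declared in the deleted header:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/gpu/cl/operators/ClSub.h"

using namespace arm_compute;

void sub_sketch()
{
    TensorInfo a(TensorShape(32U, 32U), 1, DataType::S16);
    TensorInfo b(TensorShape(32U, 32U), 1, DataType::S16);
    TensorInfo out(TensorShape(32U, 32U), 1, DataType::S16);

    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    // SATURATE clamps at the S16 limits instead of wrapping on overflow.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClSub::validate(&a, &b, &out, ConvertPolicy::SATURATE, act));
    opencl::ClSub sub;
    sub.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &out, ConvertPolicy::SATURATE, act);
    // At run time the two sources are bound as ACL_SRC_0 / ACL_SRC_1 in the pack.
}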
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClTranspose.h b/src/runtime/gpu/cl/operators/ClTranspose.h deleted file mode 100644 index d898f677ca..0000000000 --- a/src/runtime/gpu/cl/operators/ClTranspose.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_H -#define ARM_COMPUTE_CL_TRANSPOSE_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref kernels::ClTransposeKernel */ -class ClTranspose : public IClOperator -{ -public: - /** Constructor */ - ClTranspose() = default; - /** Initialise the kernel's inputs and outputs - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. Data types supported: Same as @p src - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClTransposeKernel. - * - * @param[in] src First tensor src info. Data types supported: All. - * @param[in] dst Output tensor info. Data types supported: same as @p src. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */ diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp deleted file mode 100644 index c8db697778..0000000000 --- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" -#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h" -#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h" -#include "support/Cast.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace opencl -{ -namespace -{ -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout) -{ - Size2D output_tile = Size2D{}; - - const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); - - // Check if the input spatial dimensions are 4 or smaller - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - - if(kernel_max_dim == 3U) - { - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); - } - else - { - output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); - } - } - else if(kernel_max_dim == 5U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); - } - else if(kernel_max_dim == 7U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ?
1U : 2U); - } - - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - std::vector<WinogradConfiguration> fast_math_winograd = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) -{ - // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); - - // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); - - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); - - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); - - // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); - return Status{}; -} - -} // namespace - -ClWinogradConv2d::ClWinogradConv2d() - : _batched_mm(), - _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()), - _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()), - _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()), - _border_handler(), - _input0(), - _input1(), - _batched_mm_output(), - _is_prepared(false), - _aux_mem() -{ -} - -ClWinogradConv2d::~ClWinogradConv2d() = default; - -void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); - // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); - - _is_prepared = false; - - // Configure input transform - _input_transform->configure(compile_context, src, &_input0, winograd_info); - _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); - - // Configure filter transform - _filter_transform->configure(compile_context, weights, &_input1, winograd_info); - - // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, - false, false, - GEMMLowpOutputStageInfo(), - (src->data_type() == DataType::F16))); - - // Configure output transform - _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - - _aux_mem = _batched_mm.workspace(); - _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); - _aux_mem.push_back(MemoryInfo(offset_int_vec(3), MemoryLifetime::Persistent, _input1.total_size())); - _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); -} - -Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); - return Status{}; -} - -void ClWinogradConv2d::run(ITensorPack &tensors) -{ - prepare(tensors); - - // Run input transform - auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - - CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true); - CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true); - CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true); - - ITensorPack pack_it - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, input0.get() }, - }; - CLScheduler::get().enqueue_op(_border_handler, pack_it); - CLScheduler::get().enqueue_op(*_input_transform, pack_it); - - // Run batched matrix multiplication - ITensorPack pack_mm - { - { TensorType::ACL_SRC_0, input0.get() }, - { TensorType::ACL_SRC_1, input1.get() }, - { TensorType::ACL_DST, batched_mm_output.get() }, - }; - _batched_mm.run(pack_mm); - - // Run output transform - ITensorPack pack_ot - { - { TensorType::ACL_SRC_0, batched_mm_output.get() }, - { TensorType::ACL_SRC_1, biases }, - { TensorType::ACL_DST, dst }, - }; - CLScheduler::get().enqueue_op(*_output_transform, pack_ot); -} - -void ClWinogradConv2d::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3))); - - 
CLAuxTensorHandler input1(_input1, *in1_aux); - ITensorPack pack_ft - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, input1.get() }, - }; - // Run filter transform and mark original weights as unused - CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); - weights->mark_as_unused(); - - tensors.add_tensor(ACL_SRC_1, input1.get()); - // Prepare GEMM and release reshaped weights if marked unused by ClGemm - _batched_mm.prepare(tensors); - - CLScheduler::get().queue().finish(); - _is_prepared = true; - } -} - -experimental::MemoryRequirements ClWinogradConv2d::workspace() const -{ - return _aux_mem; -} -} // namespace opencl -} // namespace arm_compute
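The prepare()/run() split in ClWinogradConv2d.cpp above is the part most easily misused: prepare() runs the filter transform once, marks the original weights unused and re-points ACL_SRC_1 at the transformed weights, so a caller reusing the operator should keep one ITensorPack alive across runs. A condensed sketch of that lifecycle; the function name is invented, the slot ids match run() above, and workspace slots are omitted because the CLAuxTensorHandlers fall back to owning allocations:

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"

using namespace arm_compute;

void winograd_lifecycle(opencl::ClWinogradConv2d &conv, ICLTensor &src, ICLTensor &weights,
                        ICLTensor &biases, ICLTensor &dst)
{
    // Assumes conv has already been configured for these tensor infos.
    ITensorPack pack{ { TensorType::ACL_SRC_0, &src },
                      { TensorType::ACL_SRC_1, &weights },
                      { TensorType::ACL_SRC_2, &biases },
                      { TensorType::ACL_DST, &dst } };

    conv.prepare(pack); // filter transform runs once; original weights marked unused
    conv.run(pack);     // input transform -> batched GEMM -> output transform
    conv.run(pack);     // later runs skip the filter transform (_is_prepared)
}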
\ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h b/src/runtime/gpu/cl/operators/ClWinogradConv2d.h deleted file mode 100644 index 83b31f1c99..0000000000 --- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H -#define ARM_COMPUTE_CL_WINOGRADCONV2D_H - -#include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" -#include "src/runtime/gpu/cl/operators/ClGemm.h" - -namespace arm_compute -{ -class CLCompileContext; -class ITensorInfo; -namespace opencl -{ -namespace kernels -{ -class ClWinogradInputTransformKernel; -class ClWinogradFilterTransformKernel; -class ClWinogradOutputTransformKernel; -} // namespace kernels -/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: - * - * -# @ref kernels::ClWinogradInputTransformKernel - * -# @ref kernels::ClWinogradFilterTransformKernel (only once) - * -# @ref ClGemm - * -# @ref kernels::ClWinogradOutputTransformKernel - * - */ -class ClWinogradConv2d : public IClOperator -{ -public: - /** Default constructor */ - ClWinogradConv2d(); - /** Default destructor */ - ~ClWinogradConv2d(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ClWinogradConv2d(const ClWinogradConv2d &) = delete; - /** Default move constructor */ - ClWinogradConv2d(ClWinogradConv2d &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete; - /** Default move assignment operator */ - ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default; - /** Set the input and output tensors. - * - * Valid data layouts: - * - NHWC - * - NCHW - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:--------------|:--------------|:------|:--------------| - * |F16 |F16 |F16 |F16 | - * |F32 |F32 |F32 |F32 | - * - * @note This function only works with 3x3, 3x1, 1x3, 5x5, 5x1, 1x5, 7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layouts - * @note Some Winograd configurations (e.g. F(4x4, 5x5)) are supported only with enable_fast_math = true - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src - * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation - * available, which can introduce a drop in accuracy. Default is false - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); - - // Inherited method overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - ClGemm _batched_mm; - std::unique_ptr<kernels::ClWinogradInputTransformKernel> _input_transform; - std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform; - std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform; - CLFillBorderKernel _border_handler; - TensorInfo _input0; - TensorInfo _input1; - TensorInfo _batched_mm_output; - bool _is_prepared; - experimental::MemoryRequirements _aux_mem{}; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */ diff --git a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h deleted file mode 100644 index 152e3c6c04..0000000000 --- a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H -#define ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H - -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/CL/CLTensor.h" - -#include "support/Cast.h" - -namespace arm_compute -{ -namespace opencl -{ -/* Tensor handler to wrap and handle tensor allocations on workspace buffers */ -class CLAuxTensorHandler -{ -public: - CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false) - : _tensor() - { - _tensor.allocator()->soft_init(info); - - ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) - { - _tensor.allocator()->allocate(); - if(pack_inject) - { - pack.add_tensor(slot_id, &_tensor); - _injected_tensor_pack = &pack; - _injected_slot_id = slot_id; - } - } - else - { - _tensor.allocator()->import_memory(packed_tensor->cl_buffer()); - } - } - - CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) - : _tensor() - { - _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) - { - _tensor.allocator()->import_memory(tensor.cl_buffer()); - } - } - - CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; - CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete; - - ~CLAuxTensorHandler() - { - if(_injected_tensor_pack) - { - _injected_tensor_pack->remove_tensor(_injected_slot_id); - } - } - - ICLTensor *get() - { - return &_tensor; - } - - ICLTensor *operator()() - { - return &_tensor; - } - -private: - CLTensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
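
For context on the helper removed above: CLAuxTensorHandler is a small RAII wrapper that maps an auxiliary tensor onto a workspace buffer supplied through an ITensorPack, allocating locally only when the packed buffer is absent or too small. A minimal usage sketch under assumed names (the slot id, `_aux_info` member and `tensors` pack are illustrative, not taken from this patch):

// Bind auxiliary tensor info to workspace slot ACL_INT_0 at run() time.
// If 'tensors' carries a large-enough buffer for that slot, it is imported;
// otherwise the handler allocates its own memory for the handler's lifetime.
CLAuxTensorHandler aux(TensorType::ACL_INT_0, _aux_info, tensors, /* pack_inject */ false);
ICLTensor *aux_tensor = aux.get(); // valid until 'aux' goes out of scope
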
\ No newline at end of file diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..aba32871d0 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_default.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + 
DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h new file mode 100644 index 0000000000..ed6a4c3c68 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Bifrost based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..4b7666d5aa --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G57: + func = configs_G57.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + 
const int32_t m = dst_shape[1] * dst_shape[2];
+        const bool    is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (dst_shape[0] <= 4)
+        {
+            if (is_pointwise)
+            {
+                if (ofm == 4)
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 4;
+                    desc.k0 = 16;
+                }
+                else
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 1;
+                    desc.k0 = 16;
+                }
+            }
+            else
+            {
+                desc.m0 = 1;
+                desc.n0 = 2;
+                desc.k0 = 16;
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                desc.m0 = 1;
+                desc.n0 = 1;
+                desc.k0 = 16;
+            }
+            else
+            {
+                desc.m0 = 4;
+                desc.n0 = 4;
+                desc.k0 = 4;
+            }
+        }
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool        export_weights_to_cl_image = export_to_cl_image(wei);
+
+        const int32_t ofm          = dst_shape[0];
+        const int32_t m            = dst_shape[1] * dst_shape[2];
+        const int32_t k            = wei_shape[0];
+        const bool    is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (dst_shape[0] <= 4)
+        {
+            // k0 should be as large as possible. However, we should avoid
+            // having left-over for loops that make the implementation slower.
+            if ((k % 16) == 0)
+            {
+                desc.k0 = 16;
+            }
+            else if ((k % 8) == 0)
+            {
+                desc.k0 = 8;
+            }
+            else
+            {
+                desc.k0 = 4;
+            }
+
+            if (is_pointwise)
+            {
+                if (ofm == 4)
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 4;
+                }
+                else
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 1;
+                }
+            }
+            else
+            {
+                desc.m0 = 1;
+                desc.n0 = dst_shape[0];
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                desc.m0 = 1;
+                desc.n0 = 1;
+                if ((k % 16) == 0)
+                {
+                    desc.k0 = 16;
+                }
+                else if ((k % 8) == 0)
+                {
+                    desc.k0 = 8;
+                }
+                else
+                {
+                    desc.k0 = 4;
+                }
+            }
+            else
+            {
+                if (ofm >= 16)
+                {
+                    if (m / 6 > 24000)
+                    {
+                        desc.m0 = 6;
+                    }
+                    else
+                    {
+                        desc.m0 = 5;
+                    }
+                    desc.n0 = 8;
+                    desc.k0 = 4;
+                }
+                else
+                {
+                    desc.m0 = 2;
+                    desc.n0 = 8;
+                    if ((k % 16) == 0)
+                    {
+                        desc.k0 = 16;
+                    }
+                    else if ((k % 8) == 0)
+                    {
+                        desc.k0 = 8;
+                    }
+                    else
+                    {
+                        desc.k0 = 4;
+                    }
+                }
+            }
+        }
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo   *src,
+                                                                               const ITensorInfo   *wei,
+                                                                               const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+        desc.n0 = 4;
+
+        if (output_shape[0] > 16)
+        {
+            desc.m0 = 4;
+        }
+
+        desc.k0 = 16;
+
+        desc.export_weights_to_cl_image = false;
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool        export_weights_to_cl_image = export_to_cl_image(wei);
+
+        const int32_t m            = dst_shape[1] * dst_shape[2];
+        const bool    is_pointwise =
(wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 2; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + if (ofm > 16) + { + desc.m0 = 4; + desc.n0 = 8; + desc.k0 = 8; + } + else + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h new file mode 100644 index 0000000000..efd879a567 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Valhall based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h new file mode 100644 index 0000000000..215b17ef79 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** ClDirectConvolution factory class */
+class ClDirectConvKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the ClDirectConvolution kernel configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClDirectConvKernelConfig
+     */
+    static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu);
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
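
A hedged, caller-side sketch of how the factory above is intended to be used; the variable names (`src_info`, `wei_info`, `conv_info`) are assumptions, not code from this patch:

// Select the per-architecture heuristic for the current device and query
// the tile sizes (m0/n0/k0) for a given direct convolution configuration.
std::unique_ptr<cl_direct_conv::IClDirectConvKernelConfig> config =
    cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(GPUTarget::G78);
const DirectConvComputeKernelInfo desc = config->configure(&src_info, &wei_info, conv_info);

As a worked instance of the Bifrost defaults earlier in this patch: an NHWC F32 convolution on G71 with more than 16 output channels comes back with n0 = 4, m0 = 2, k0 = 8 and no cl_image export of the weights.
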
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..e5b270c720
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Basic container for the OpenCL direct convolution configuration functions */
+template <class T>
+class ClDirectConvConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for direct convolution F32
+     * @param[in] func_f16  Function to call for direct convolution F16
+     * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the direct convolution configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function, or nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the Direct convolution kernel configuration */
+class IClDirectConvKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClDirectConvKernelConfig(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClDirectConvKernelConfig() = default;
+    /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs
+     *
+     * @param[in] src       Source tensor (activation tensor)
+     * @param[in] wei       Weights tensor
+     * @param[in] conv_info Convolution info
+     *
+     * @return a @ref DirectConvComputeKernelInfo
+     */
+    virtual DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..98ebf3ebbe
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace + +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_G7x.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, 
dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..41d86c9c14 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..ef1bb3858c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G77: + func = configs_G77.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo 
*wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..fabce77b54 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/cpu/operators/CpuDequantize.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index 80a2e28aee..c8b006c546 100644 --- a/src/runtime/cpu/operators/CpuDequantize.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,34 +21,41 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/runtime/cpu/operators/CpuDequantize.h" - +#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDequantizeKernel.h" +#include "arm_compute/core/TensorShape.h" namespace arm_compute { -namespace cpu +namespace cl_dwc { -void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst) +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { - auto k = std::make_unique<kernels::CpuDequantizeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} + // Check whether we can use the cl image with the weights. 
+    if (!export_to_cl_image(weights))
+    {
+        return false;
+    }
 
-Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::CpuDequantizeKernel::validate(src, dst);
-}
+    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t kernel_w = weights->tensor_shape()[idx_w];
+    const size_t kernel_h = weights->tensor_shape()[idx_h];
 
-void CpuDequantize::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    prepare(tensors);
-    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+    // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons:
+    // 1- When the kernel size is 1x1
+    // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+    if ((kernel_w == 1) && (kernel_h == 1))
+    {
+        return false;
+    }
+
+    if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+    {
+        return false;
+    }
+
+    return true;
 }
-} // namespace cpu
+} // namespace cl_dwc
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
index 5a4f6c518e..e3484c04ff 100644
--- a/src/runtime/cpu/operators/CpuCast.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,24 +21,25 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/runtime/cpu/operators/CpuCast.h"
-
-#include "src/core/cpu/kernels/CpuCastKernel.h"
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
 
 namespace arm_compute
 {
-namespace cpu
-{
-void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
-    auto k = std::make_unique<kernels::CpuCastKernel>();
-    k->configure(src, dst, policy);
-    _kernel = std::move(k);
-}
+// Forward declaration
+class ITensorInfo;
 
-Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+namespace cl_dwc
 {
-    return kernels::CpuCastKernel::validate(src, dst, policy);
-}
-} // namespace cpu
+/** Utility function to know whether we can use the cl image storage for the weights of depthwise convolution to get better performance
+ *
+ * @param[in] weights          Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
 } // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..031cf1859a
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_dwc +{ +/** ClDWCNativeKernelConfigurationFactory factory class */ +class ClDWCNativeKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDWCNative kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDWCNativeKernelConfig + */ + static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + // The heuristic for Midgard is the same as the one used for Arm Mali-G71 + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..614a6622df --- /dev/null +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Basic container for the OpenCL depthwise convolution configuration functions */ +template <class T> +class ClDWCNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for depthwise convolution F32 + * @param[in] func_f16 Function to call for depthwise convolution F16 + * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the depthwise convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the depthwise convolution kernel configuration */ +class IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClDWCNativeKernelConfig() = default; + /** This method returns the @ref DWCComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + * @param[in] dilation Kernel dilation + * @param[in] depth_multiplier Output feature maps 
multiplier
+     */
+    virtual DWCComputeKernelInfo configure(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..3380d8f1b7
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu)
+{
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                          const ITensorInfo   *wei,
+                                                                          const PadStrideInfo &conv_info)
+{
+    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+    ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+
+    // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). This is because the indirect buffer makes
+    // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
+    // for the pointwise convolution cases.
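+    //
+    // For reference, callers are expected to reach this heuristic through its factory, along
+    // the lines of the following sketch (illustrative only; src, wei and conv_info are assumed
+    // to be valid, and G77 is just an example Valhall target):
+    //
+    //   auto heuristic = ClIndirectConvKernelConfigurationFactory::create(GPUTarget::G77);
+    //   const DirectConvComputeKernelInfo desc = heuristic->configure(src, wei, conv_info);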
+
+    ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type());
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution");
+    return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo   *src,
+                                                                                  const ITensorInfo   *wei,
+                                                                                  const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool export_weights_to_cl_image = export_to_cl_image(wei);
+        const int32_t stride_x = conv_info.stride().first;
+        const int32_t stride_y = conv_info.stride().second;
+        const int32_t ofm = dst_shape[0];
+        const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (ofm <= 4)
+        {
+            desc.m0 = 1;
+            desc.n0 = 2;
+            desc.k0 = 16;
+        }
+        else
+        {
+            // The 16000 threshold value has been identified as the right
+            // one for using the biggest block size allowed on F32: 5x4x4
+            if (m < 16000)
+            {
+                desc.m0 = 4;
+                desc.n0 = 4;
+                desc.k0 = 4;
+            }
+            else
+            {
+                desc.m0 = 5;
+                desc.n0 = 4;
+                desc.k0 = 4;
+            }
+        }
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo   *src,
+                                                                                  const ITensorInfo   *wei,
+                                                                                  const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+        const int32_t ofm = dst_shape[0];
+        const int32_t m = dst_shape[1] * dst_shape[2];
+        const int32_t k = wei_shape[0];
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (ofm <= 4)
+        {
+            // k0 should be as large as possible. However, we should avoid
+            // having left-over for loops that make the implementation slower.
+            if ((k % 16) == 0)
+            {
+                desc.k0 = 16;
+            }
+            else if ((k % 8) == 0)
+            {
+                desc.k0 = 8;
+            }
+            else
+            {
+                desc.k0 = 4;
+            }
+
+            desc.m0 = 1;
+            desc.n0 = ofm;
+        }
+        else
+        {
+            // The 16000 threshold value has been identified as the right
+            // one for using the biggest block size allowed on F16: 8x4
+            if (m >= 16000 && k < 4)
+            {
+                desc.m0 = 8;
+                desc.n0 = 4;
+                desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
+            }
+            else
+            {
+                desc.m0 = 5;
+                desc.n0 = 4;
+                desc.k0 = 8;
+            }
+        }
+    }
+
+    return desc;
+}
+} // namespace cl_indirect_conv
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCopy.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index 057bb6efa5..bab808c66c 100644
--- a/src/runtime/cpu/operators/CpuCopy.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,35 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
*/ -#ifndef ARM_COMPUTE_CPU_COPY_H -#define ARM_COMPUTE_CPU_COPY_H +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL -#include "src/runtime/cpu/ICpuOperator.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" namespace arm_compute { -namespace cpu +namespace cl_indirect_conv { -/** Basic function to run @ref kernels::CpuCopyKernel */ -class CpuCopy : public ICpuOperator +/** Valhall based OpenCL indirect convolution configuration */ +class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig { public: - /** Constructor */ - CpuCopy() = default; - /** Configure operator for a given list of arguments + /** Constructor * - * @param[in] src Source tensor info. Data type supported: All - * @param[out] dst Destination info. Data type supported: Same as @p src + * @param[in] gpu GPU target */ - void configure(const ITensorInfo *src, ITensorInfo *dst); + ClIndirectConvDefaultConfigValhall(GPUTarget gpu); - /** Static function to check if given info will lead to a valid configuration of @ref CpuCopy - * - * @param[in] src Source tensor info. Data type supported: All - * @param[in] dst Destination tensor info. Data type supported: Same as @p src - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace cpu +} // namespace cl_indirect_conv } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_COPY_H */ +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..5e7ba6f8e9 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** ClIndirectConvolution factory class */ +class ClIndirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClIndirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClIndirectConvKernelConfig + */ + static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..d05da18b58 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** Basic container for the OpenCL indirect convolution configuration functions */ +template <class T> +class ClIndirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + + /** Constructor + * + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 + * + */ + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} + { + } + + /** Method to return the indirect convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the indirect convolution kernel configuration */ +class IClIndirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClIndirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3a02a60650 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) +{ +} + +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +{ + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultConfigValhall::configure_G715_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G715_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G715_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + case GPUTarget::G710: + default: + func = configs_G710.get_function(lhs->data_type()); + break; + } + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + TensorShape lhs_shape = lhs->tensor_shape(); + TensorShape rhs_shape = rhs->tensor_shape(); + + const bool is_batched = lhs_shape.num_dimensions() > 2; + + if (is_batched == true) + { + lhs_shape.collapse_from(2); + } + + const unsigned int m = adj_lhs ? lhs_shape.x() : lhs_shape.y(); + const unsigned int n = adj_rhs ? rhs_shape.y() : rhs_shape.x(); + const unsigned int k = adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + const unsigned int b = lhs_shape.z(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + return configure_G715_f32(m, n, k, b, rhs_lock_padding, info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 
2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 
0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(rhs_lock_padding); + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; + + const bool adj_lhs = info.adj_lhs(); + 
const bool adj_rhs = info.adj_rhs(); + + if ((adj_lhs == false) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else + { + return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); + } +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..5279871057 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; + +private: + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultVariantValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) override; + +private: + MatMulKernelType configure_G715_float(int k, bool act_enabled); + MatMulKernelType configure_G715_quantized(int k, bool act_enabled); + MatMulKernelType configure_default_float(int k, bool act_enabled); + MatMulKernelType configure_default_quantized(int k, bool act_enabled); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp new file mode 100644 index 0000000000..89cad30214 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+                             const MatMulKernelInfo &info1,
+                             unsigned int            m,
+                             unsigned int            n,
+                             unsigned int            k,
+                             unsigned int            b,
+                             DataType                data_type,
+                             bool                    rhs_lock_padding)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+                             "The fallback MatMul configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
+
+    const bool adj_lhs = info0.adj_lhs;
+    const bool adj_rhs = info0.adj_rhs;
+
+    TensorInfo lhs_info =
+        !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+    TensorInfo rhs_info =
+        !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+    TensorInfo dst_info;
+
+    if (rhs_lock_padding == false)
+    {
+        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+        {
+            return info0;
+        }
+        else
+        {
+            return info1;
+        }
+    }
+    else
+    {
+        return info1;
+    }
+}
+
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool                             adj_lhs,
+                           bool                             adj_rhs,
+                           unsigned int                     m,
+                           unsigned int                     n,
+                           unsigned int                     k,
+                           unsigned int                     b)
+{
+    size_t min_acc = std::numeric_limits<size_t>::max();
+    size_t min_idx = 0;
+
+    ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+    const size_t num_rows = configs.size();
+    const size_t num_cols = configs[0].size();
+
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+                             "The entry should have 8 integer values representing: M, N, K, B, M0, N0, K0, IMG_RHS");
+    ARM_COMPUTE_UNUSED(num_cols);
+
+    // Find nearest GeMM workload
+    // Note: the distance is computed over the M, N, K and B dimensions of the workload
+    for (size_t y = 0; y < num_rows; ++y)
+    {
+        size_t mc0 = static_cast<size_t>(configs[y][0]);
+        size_t nc0 = static_cast<size_t>(configs[y][1]);
+        size_t kc0 = static_cast<size_t>(configs[y][2]);
+        size_t bc0 = static_cast<size_t>(configs[y][3]);
+
+        size_t acc = 0;
+        acc += (m - mc0) * (m - mc0);
+        acc += (n - nc0) * (n - nc0);
+        acc += (k - kc0) * (k - kc0);
+        acc += (b - bc0) * (b - bc0);
+        acc = std::sqrt(acc);
+        if (acc < min_acc)
+        {
+            min_acc = acc;
+            min_idx = y;
+        }
+    }
+
+    // Get the configuration from the nearest GeMM shape
+    MatMulKernelInfo desc;
+    desc.adj_lhs = adj_lhs;
+    desc.adj_rhs = adj_rhs;
+    desc.m0 = configs[min_idx][4];
+    desc.n0 = configs[min_idx][5];
+    desc.k0 = configs[min_idx][6];
+    desc.export_rhs_to_cl_image = configs[min_idx][7];
+
+    return desc;
+}
+} // namespace cl_matmul
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
new file mode 100644
index 0000000000..699f5fe8c1
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct MatMulKernelInfo;
+
+namespace cl_matmul
+{
+using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>;
+
+/** This function accepts two MatMulKernelInfo objects, where only the first may have cl_image2d support enabled.
+ * The aim of this function is to check whether the first MatMulKernelInfo object is valid; if it is not, the
+ * function returns the second MatMulKernelInfo object, otherwise the first one.
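+ *
+ * A typical call sequence pairs this function with find_info(), along the lines of the following
+ * sketch (illustrative only; configs_best and configs_fallback stand for the configuration
+ * matrices defined in ClMatMulNativeDefaultConfigValhall):
+ *
+ *   MatMulKernelInfo desc0 = find_info(configs_best, adj_lhs, adj_rhs, m, n, k, b);
+ *   MatMulKernelInfo desc1 = find_info(configs_fallback, adj_lhs, adj_rhs, m, n, k, b);
+ *   return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding);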
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
new file mode 100644
index 0000000000..699f5fe8c1
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct MatMulKernelInfo;
+
+namespace cl_matmul
+{
+using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>;
+
+/** This function accepts two MatMulKernelInfo objects, where only the first may have cl_image2d support enabled.
+ * It checks whether the first MatMulKernelInfo object is valid for the given workload and, if so, returns it;
+ * otherwise it returns the second (fallback) MatMulKernelInfo object.
+ *
+ * @param[in] info0            MatMulKernelInfo with cl_image2d support
+ * @param[in] info1            MatMulKernelInfo to fall back to if cl_image2d cannot be used
+ * @param[in] m                Number of rows (M) of the LHS matrix
+ * @param[in] n                Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k                Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b                Batch size
+ * @param[in] data_type        Data type
+ * @param[in] rhs_lock_padding Flag indicating whether the RHS paddings are locked
+ *
+ * @return @ref MatMulKernelInfo
+ */
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+                             const MatMulKernelInfo &info1,
+                             unsigned int            m,
+                             unsigned int            n,
+                             unsigned int            k,
+                             unsigned int            b,
+                             DataType                data_type,
+                             bool                    rhs_lock_padding);
+
+/** Find the preferred configuration for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
+ *
+ * @param[in] configs List of best configurations for a limited number of MatMul shapes
+ * @param[in] adj_lhs Adjoint LHS flag value
+ * @param[in] adj_rhs Adjoint RHS flag value
+ * @param[in] m       Number of rows (M) of the LHS matrix
+ * @param[in] n       Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k       Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b       Batch size
+ *
+ * @return @ref MatMulKernelInfo
+ */
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool                             adj_lhs,
+                           bool                             adj_rhs,
+                           unsigned int                     m,
+                           unsigned int                     n,
+                           unsigned int                     k,
+                           unsigned int                     b);
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
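
[Editor's note] As a usage sketch (not code from this patch), a concrete heuristic could pair the two helpers: look up the nearest tuned configuration with and without cl_image2d, then let select_info validate the cl_image2d candidate and fall back if needed. The tables example_configs_img and example_configs_plain are assumed to exist for the example:

// A minimal sketch, assuming it compiles inside namespace arm_compute::cl_matmul
// and that example_configs_img / example_configs_plain are hypothetical tuned
// tables in the 8-column format described above.
MatMulKernelInfo example_heuristic(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    const bool adj_lhs = false;
    const bool adj_rhs = false;

    // Nearest tuned configuration that exports the RHS to cl_image2d...
    const MatMulKernelInfo info_img = find_info(example_configs_img, adj_lhs, adj_rhs, m, n, k, b);
    // ...and the plain-buffer fallback.
    const MatMulKernelInfo info_plain = find_info(example_configs_plain, adj_lhs, adj_rhs, m, n, k, b);

    // select_info keeps info_img only if ClMatMulNativeKernel::validate() accepts it
    // and the RHS paddings are not locked; otherwise it returns info_plain.
    return select_info(info_img, info_plain, m, n, k, b, DataType::F32, /* rhs_lock_padding */ false);
}
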
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..e7485bca81
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** ClMatMul configuration factory class */
+class ClMatMulNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to create the ClMatMul configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelConfig
+     */
+    static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
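
[Editor's note] A brief usage sketch for the factory; the tensor infos and MatMulInfo below are assumed to be set up by the caller, and the GPU target would normally come from the CL scheduler rather than being hard-coded:

// Hypothetical call site:
//   auto config = ClMatMulNativeKernelConfigurationFactory::create(GPUTarget::VALHALL);
//   const MatMulKernelInfo kernel_info = config->configure(&lhs_info, &rhs_info, matmul_info);
// kernel_info then carries the chosen m0/n0/k0 tile sizes and the
// export_rhs_to_cl_image flag consumed by ClMatMulNativeKernel.
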
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..c2895b8919
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** ClMatMul variant factory class */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+    /** Static method to create the ClMatMul variant class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelVariant
+     */
+    static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
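
[Editor's note] The variant factory mirrors the configuration factory, but its product answers a different question: which kernel family to run (see IClMatMulNativeKernelVariant below) rather than which tile sizes to use. A hedged usage sketch, with act_info an assumed ActivationLayerInfo:

// Hypothetical call site:
//   auto variant = ClMatMulNativeKernelVariantFactory::create(GPUTarget::VALHALL);
//   const MatMulKernelType kernel_type =
//       variant->select_kernel(&lhs_info, &rhs_info, matmul_info, act_info);
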
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Basic container for the OpenCL MatMul Native configuration functions */
+template <class T>
+class ClMatMulNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for matmul native F32
+     * @param[in] func_f16  Function to call for matmul native F16
+     * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the matmul native configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the matmul native kernel configuration
+ *  This is the base class that chooses architecture-specific kernel configurations.
+*/
+class IClMatMulNativeKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelConfig() = default;
+    /** This method returns the @ref MatMulKernelInfo for the given inputs
+     *
+     * @param[in] lhs  LHS tensor
+     * @param[in] rhs  RHS tensor
+     * @param[in] info MatMul info
+     */
+    virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
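
[Editor's note] For context, a sketch of how a ClMatMulNativeConfigArray is typically wired up. The concrete heuristics usually store pointers to member functions of the config class; the free-function version below is a simplified illustration of the same dispatch, with invented names:

// Simplified sketch with free functions instead of member-function pointers.
using ConfigFn = MatMulKernelInfo (*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);

MatMulKernelInfo config_f32(unsigned int, unsigned int, unsigned int, unsigned int);  // hypothetical
MatMulKernelInfo config_f16(unsigned int, unsigned int, unsigned int, unsigned int);  // hypothetical
MatMulKernelInfo config_int8(unsigned int, unsigned int, unsigned int, unsigned int); // hypothetical

// ClMatMulNativeConfigArray<ConfigFn> configs(&config_f32, &config_f16, &config_int8);
// ConfigFn fn = configs.get_function(DataType::QASYMM8); // resolves to &config_int8
// if (fn == nullptr) { /* data type not supported by the heuristic */ }
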
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..eac41dd6a3
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+
+#include "arm_compute/core/CoreTypes.h" // DataType
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+enum class MatMulKernelType
+{
+    /** Native matrix multiplication for FP types */
+    NATIVE_FP,
+
+    /** Native matrix multiplication for quantized types */
+    NATIVE_QUANTIZED,
+
+    /** Native matrix multiplication using MMUL extension for FP types */
+    NATIVE_MMUL_FP,
+
+    /** Native matrix multiplication using MMUL extension for quantized types */
+    NATIVE_MMUL_QUANTIZED
+};
+
+/** Basic container for the OpenCL MatMul Native variant functions */
+template <class T>
+class ClMatMulNativeVariantArray
+{
+public:
+    /** Alias for float index */
+    static constexpr size_t DT_FLOAT = 0;
+    /** Alias for quantized type index */
+    static constexpr size_t DT_QUANTIZED = 1;
+
+    /** Constructor
+     *
+     * @param[in] func_float     Function to call for matmul native float (F32, F16)
+     * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized}
+    {
+    }
+
+    /** Method to return the matmul native variant function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+            case DataType::F16:
+                return _configs.at(DT_FLOAT);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_QUANTIZED);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 2> _configs;
+};
+
+/** Basic interface for the matmul native kernel variant
+ *  This is the base class that chooses architecture-specific kernel variants.
+*/
+class IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelVariant() = default;
+    /** This method returns the @ref MatMulKernelType for the given inputs
+     *
+     * @param[in] lhs      LHS tensor
+     * @param[in] rhs      RHS tensor
+     * @param[in] info     MatMul info
+     * @param[in] act_info Activation layer info
+     */
+    virtual MatMulKernelType select_kernel(const ITensorInfo         *lhs,
+                                           const ITensorInfo         *rhs,
+                                           const MatMulInfo          &info,
+                                           const ActivationLayerInfo &act_info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
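
[Editor's note] Finally, a sketch of how a caller might branch on the returned MatMulKernelType; the mapping to concrete kernels in the comments is an assumption about the surrounding codebase, not something this patch defines:

// Hypothetical dispatch on the selected variant:
//   switch (kernel_type)
//   {
//       case MatMulKernelType::NATIVE_FP:
//       case MatMulKernelType::NATIVE_QUANTIZED:
//           // configure the plain native MatMul kernel
//           break;
//       case MatMulKernelType::NATIVE_MMUL_FP:
//       case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
//           // configure the MMUL-extension kernels (these rely on the
//           // cl_arm_matrix_multiply OpenCL extension)
//           break;
//   }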