Diffstat (limited to 'src/runtime')
422 files changed, 21610 insertions, 26635 deletions
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index bf219514fc..eca712dbf0 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,10 +22,9 @@
 * SOFTWARE.
 */
 #include "arm_compute/runtime/Allocator.h"
-#include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/core/Error.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/runtime/MemoryRegion.h"

 #include <cstddef>

@@ -44,5 +43,5 @@ void Allocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
 {
-    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
+    return std::make_unique<MemoryRegion>(size, alignment);
 }
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 6bbb731d5d..8a0fc05c39 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,16 +27,15 @@
 #include "arm_compute/runtime/BlobMemoryPool.h"
 #include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/IMemoryGroup.h"
-#include "support/MemorySupport.h"

 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <map>

 namespace arm_compute
 {
-BlobLifetimeManager::BlobLifetimeManager()
-    : _blobs()
+BlobLifetimeManager::BlobLifetimeManager() : _blobs()
 {
 }

@@ -48,7 +47,7 @@ const BlobLifetimeManager::info_type &BlobLifetimeManager::info() const
 std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
 {
     ARM_COMPUTE_ERROR_ON(allocator == nullptr);
-    return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs);
+    return std::make_unique<BlobMemoryPool>(allocator, _blobs);
 }

 MappingType BlobLifetimeManager::mapping_type() const
@@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings()
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);

     // Sort free blobs requirements in descending order.
-    _free_blobs.sort([](const Blob & ba, const Blob & bb)
-    {
-        return ba.max_size > bb.max_size;
-    });
+    _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; });

     // Create group sizes vector
     std::vector<BlobInfo> group_sizes;
-    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
-    {
-        return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() };
-    });
+    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes),
+                   [](const Blob &b) {
+                       return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()};
+                   });

     // Update blob sizes
     size_t max_size = std::max(_blobs.size(), group_sizes.size());
     _blobs.resize(max_size);
     group_sizes.resize(max_size);
-    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
-    {
-        return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) };
-    });
+    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs),
+                   [](BlobInfo lhs, BlobInfo rhs)
+                   {
+                       return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment),
+                                       std::max(lhs.owners, rhs.owners)};
+                   });

     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
     int   blob_idx       = 0;

-    for(auto &free_blob : _free_blobs)
+    for (auto &free_blob : _free_blobs)
     {
-        for(auto &bound_element_id : free_blob.bound_elements)
+        for (auto &bound_element_id : free_blob.bound_elements)
         {
             ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
             Element &bound_element = _active_elements[bound_element_id];
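Note: the change that recurs across almost every file in this commit is the swap from the library's pre-C++14 shim, support::cpp14::make_unique, to std::make_unique, now that the codebase builds as C++14. A minimal sketch of the before/after pattern; RegionExample is an illustrative stand-in, not a library type:

    #include <cstddef>
    #include <memory>

    struct RegionExample
    {
        RegionExample(std::size_t /*size*/, std::size_t /*alignment*/) {}
    };

    std::unique_ptr<RegionExample> make_region(std::size_t size, std::size_t alignment)
    {
        // Before: arm_compute::support::cpp14::make_unique<RegionExample>(size, alignment)
        // After:  the standard facility, so the custom shim header can be dropped.
        return std::make_unique<RegionExample>(size, alignment);
    }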
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 907b39f9c6..a2f63ef52b 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,7 +27,6 @@
 #include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/IMemoryPool.h"
 #include "arm_compute/runtime/Types.h"
-#include "support/MemorySupport.h"

 #include <vector>

@@ -42,14 +41,13 @@ BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob
 BlobMemoryPool::~BlobMemoryPool()
 {
-    ARM_COMPUTE_ERROR_ON(!_allocator);
     free_blobs();
 }

 void BlobMemoryPool::acquire(MemoryMappings &handles)
 {
     // Set memory to handlers
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(_blobs[handle.second].get());
@@ -58,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles)
 void BlobMemoryPool::release(MemoryMappings &handles)
 {
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(nullptr);
@@ -73,14 +71,14 @@ MappingType BlobMemoryPool::mapping_type() const
 std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
+    return std::make_unique<BlobMemoryPool>(_allocator, _blob_info);
 }

 void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    for(const auto &bi : blob_info)
+    for (const auto &bi : blob_info)
     {
         _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
     }
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index f50d10034c..b4545b93bf 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,34 +23,20 @@
 */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"

-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"

 #include <cstddef>

 namespace arm_compute
 {
-CLBufferAllocator::CLBufferAllocator(CLCoreRuntimeContext *ctx)
-    : _ctx(ctx)
-{
-}
-
 void *CLBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    cl_mem buf;
-    if(_ctx == nullptr)
-    {
-        buf = clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
-    }
-    else
-    {
-        buf = clCreateBuffer(_ctx->context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
-    }
+    cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+                              nullptr, nullptr)};
     return static_cast<void *>(buf);
 }

@@ -63,6 +49,6 @@ void CLBufferAllocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    return std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
 }
 } // namespace arm_compute
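Note: with CLCoreRuntimeContext removed, CLBufferAllocator no longer branches on a per-context queue and always allocates from the global CLScheduler context. A hedged sketch of the same allocation call, plus the error check the runtime itself omits (it passes nullptr for the error pointer); the wrapper header is the one the library includes above:

    #include "arm_compute/core/CL/OpenCL.h" // library's OpenCL wrapper, assumed available
    #include <cstddef>

    void *allocate_host_visible(const cl::Context &context, size_t size)
    {
        cl_int err = CL_SUCCESS;
        cl_mem buf = clCreateBuffer(context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
                                    size, nullptr, &err);
        return (err == CL_SUCCESS) ? static_cast<void *>(buf) : nullptr;
    }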
diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
deleted file mode 100644
index f1dd95e77e..0000000000
--- a/src/runtime/CL/CLDistribution1D.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLDistribution1D.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLDistribution1D::CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
-    : ICLDistribution1D(num_bins, offset, range), _mem(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, num_bins * sizeof(int32_t))
-{
-}
-
-void CLDistribution1D::map(bool blocking)
-{
-    ICLDistribution1D::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLDistribution1D::unmap()
-{
-    ICLDistribution1D::unmap(CLScheduler::get().queue());
-}
-
-uint32_t *CLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
-    return static_cast<uint32_t *>(q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
-}
-
-void CLDistribution1D::do_unmap(cl::CommandQueue &q)
-{
-    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
-    q.enqueueUnmapMemObject(_mem, _mapping);
-}
-
-cl::Buffer &CLDistribution1D::cl_buffer()
-{
-    return _mem;
-}
diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index e88aa7124c..d680dc08bb 100644
--- a/src/runtime/DeviceProperties.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,14 +21,24 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/runtime/DeviceProperties.h"
+#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"

-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CL/mlgo/MLGOHeuristics.h"

 namespace arm_compute
 {
-DeviceProperties::DeviceProperties()
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
 {
-    get_cpu_configuration(cpu_info);
 }
+CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
+
+bool CLGEMMHeuristicsHandle::reload_from_file(const std::string &filename)
+{
+    return _heuristics->reload_from_file(filename);
+}
+const mlgo::MLGOHeuristics *CLGEMMHeuristicsHandle::get() const
+{
+    return _heuristics.get();
+}
+
 } // namespace arm_compute
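Note: CLGEMMHeuristicsHandle is a pimpl-style wrapper; the public header only needs a forward declaration of mlgo::MLGOHeuristics, keeping the MLGO machinery out of the installed API. The destructor is defaulted in the .cpp (as above) because unique_ptr's deleter must see the complete type there. A sketch of the header side of the pattern, with illustrative names:

    #include <memory>
    #include <string>

    namespace mlgo { class MLGOHeuristics; } // forward declaration only in the header

    class HeuristicsHandleSketch
    {
    public:
        HeuristicsHandleSketch();
        ~HeuristicsHandleSketch(); // defined in the .cpp, where MLGOHeuristics is complete
        bool reload_from_file(const std::string &filename);

    private:
        std::unique_ptr<mlgo::MLGOHeuristics> _heuristics;
    };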
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
deleted file mode 100644
index c4ea6398e5..0000000000
--- a/src/runtime/CL/CLHOG.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/CLHOG.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLHOG::CLHOG()
-    : _info(), _buffer()
-{
-}
-
-void CLHOG::init(const HOGInfo &input)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
-    _info   = input;
-    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
-}
-
-void CLHOG::free()
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-
-    _buffer = cl::Buffer();
-}
-
-const HOGInfo *CLHOG::info() const
-{
-    return &_info;
-}
-
-const cl::Buffer &CLHOG::cl_buffer() const
-{
-    return _buffer;
-}
-
-void CLHOG::map(bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
-    ICLHOG::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLHOG::unmap()
-{
-    ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
-    ICLHOG::unmap(CLScheduler::get().queue());
-}
-
-uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
-}
-
-void CLHOG::do_unmap(cl::CommandQueue &q)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, descriptor());
-}
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index c4c7ee2107..eb28ecbf8d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"

 namespace
@@ -49,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void
 * @return A pointer to the context properties which can be used to create an opencl context
 */
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform &platform,
+                                   const cl::Device &device,
+                                   std::array<cl_context_properties, 7> &prop)
 {
     ARM_COMPUTE_UNUSED(device);
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
     // Query devices in the context for cl_arm_printf support
-    if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+    if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
     {
         // Create a cl_context with a printf_callback and user specified buffer size.
-        std::array<cl_context_properties, 7> properties_printf =
-        {
+        std::array<cl_context_properties, 7> properties_printf = {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             // Enable a printf callback function for this context.
             CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
             // Request a minimum printf buffer size of 4MB for devices in the
             // context that support this extension.
-            CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-            0
-        };
+            CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
         prop = properties_printf;
     }
     else
 #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
     {
-        std::array<cl_context_properties, 3> properties =
-        {
-            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
-            0
-        };
+        std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+                                                           reinterpret_cast<cl_context_properties>(platform()), 0};
         std::copy(properties.begin(), properties.end(), prop.begin());
     };
 }
@@ -84,21 +81,54 @@
 namespace arm_compute
 {
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device()
+cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
 {
-    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
     std::vector<cl::Platform> platforms;
     cl::Platform::get(&platforms);
     ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
-    cl::Platform p = platforms[0];
+
+    cl::Platform selected_platform{nullptr};
+
+    // If the user has selected the Native platform, return the first available.
+    switch (cl_backend_type)
+    {
+        case CLBackendType::Native:
+            selected_platform = platforms[0];
+            break;
+        case CLBackendType::Clvk:
+            for (auto p : platforms)
+            {
+                std::string res = p.getInfo<CL_PLATFORM_NAME>();
+                if (res.find("clvk") != std::string::npos)
+                {
+                    selected_platform = p;
+                    break;
+                }
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported backend type");
+    }
+
+    if (!selected_platform())
+    {
+        ARM_COMPUTE_ERROR("No valid platform found");
+    }
+
+    return selected_platform;
+}
+
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
+{
+    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+    cl::Platform p = select_preferable_platform(cl_backend_type);
     cl::Device device;
     std::vector<cl::Device> platform_devices;
     p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
     ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-    device     = platform_devices[0];
-    cl_int err = CL_SUCCESS;
-    std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+    device                                          = platform_devices[0];
+    cl_int err                                      = CL_SUCCESS;
+    std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
     initialise_context_properties(p, device, properties);
     cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -108,7 +138,7 @@ create_opencl_context_and_device()
 void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
-    if(ctx)
+    if (ctx)
     {
         ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
         ctx->gpu_scheduler()->enqueue(*kernel, flush);
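Note: select_preferable_platform makes layered OpenCL implementations selectable: CLBackendType::Native takes the first reported platform, while CLBackendType::Clvk scans CL_PLATFORM_NAME for "clvk". A hedged usage sketch of the new entry point; the default argument values are not visible in this hunk, so everything is passed explicitly:

    #include "arm_compute/runtime/CL/CLScheduler.h"

    void init_over_clvk()
    {
        // Errors out if no platform whose name contains "clvk" is present.
        arm_compute::CLScheduler::get().default_init(/*cl_tuner=*/nullptr,
                                                     /*gemm_h=*/nullptr,
                                                     arm_compute::CLBackendType::Clvk);
    }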
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
deleted file mode 100644
index a8cbf2131f..0000000000
--- a/src/runtime/CL/CLLut.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLLut.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <cstring>
-
-using namespace arm_compute;
-
-CLLut::CLLut()
-    : _allocator()
-{
-}
-
-CLLut::CLLut(size_t num_elements, DataType data_type)
-    : _allocator()
-{
-    _allocator.init(num_elements, data_type);
-}
-
-size_t CLLut::num_elements() const
-{
-    return _allocator.num_elements();
-}
-
-uint32_t CLLut::index_offset() const
-{
-    return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
-}
-
-size_t CLLut::size_in_bytes() const
-{
-    return _allocator.size();
-}
-
-DataType CLLut::type() const
-{
-    return _allocator.type();
-}
-
-const cl::Buffer &CLLut::cl_buffer() const
-{
-    return _allocator.cl_data();
-}
-
-void CLLut::clear()
-{
-    cl::CommandQueue &q    = CLScheduler::get().queue();
-    uint8_t          *data = _allocator.map(q, true /* blocking */);
-    std::memset(data, 0, size_in_bytes());
-    _allocator.unmap(q, data);
-}
-
-ILutAllocator *CLLut::allocator()
-{
-    return &_allocator;
-}
-
-void CLLut::map(bool blocking)
-{
-    ICLLut::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLLut::unmap()
-{
-    ICLLut::unmap(CLScheduler::get().queue());
-}
-
-uint8_t *CLLut::do_map(cl::CommandQueue &q, bool blocking)
-{
-    return _allocator.map(q, blocking);
-}
-
-void CLLut::do_unmap(cl::CommandQueue &q)
-{
-    _allocator.unmap(q, buffer());
-}
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
deleted file mode 100644
index 311de4bb8d..0000000000
--- a/src/runtime/CL/CLLutAllocator.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLLutAllocator.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLLutAllocator::CLLutAllocator()
-    : _buffer(), _mapping(nullptr)
-{
-}
-
-uint8_t *CLLutAllocator::data()
-{
-    return _mapping;
-}
-
-const cl::Buffer &CLLutAllocator::cl_data() const
-{
-    return _buffer;
-}
-
-uint8_t *CLLutAllocator::map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
-}
-
-void CLLutAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, mapping);
-}
-
-void CLLutAllocator::allocate()
-{
-    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size());
-}
-
-uint8_t *CLLutAllocator::lock()
-{
-    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
-    cl::CommandQueue q = CLScheduler::get().queue();
-    _mapping           = map(q, true);
-    return _mapping;
-}
-
-void CLLutAllocator::unlock()
-{
-    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
-    cl::CommandQueue q = CLScheduler::get().queue();
-    unmap(q, _mapping);
-    _mapping = nullptr;
-}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 557378b6f1..c6ee6fde83 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,24 +24,22 @@
 #include "arm_compute/runtime/CL/CLMemory.h"

 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+
+#include "support/Cast.h"

 namespace arm_compute
 {
-CLMemory::CLMemory()
-    : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
 {
 }

-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
-    : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
 }

-CLMemory::CLMemory(ICLMemoryRegion *memory)
-    : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
 {
     _region = memory;
 }
@@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
     _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
     _region       = _region_owned.get();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 7ae16ec6fc..c9ddf9b85c 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,18 +23,15 @@
 */
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"

-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"

+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
-ICLMemoryRegion::ICLMemoryRegion(CLCoreRuntimeContext *ctx, size_t size)
-    : IMemoryRegion(size),
-      _queue((ctx != nullptr) ? ctx->queue() : CLScheduler::get().queue()),
-      _ctx((ctx != nullptr) ? ctx->context() : CLScheduler::get().context()),
-      _mapping(nullptr),
-      _mem()
+ICLMemoryRegion::ICLMemoryRegion(size_t size)
+    : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
 {
 }

@@ -59,21 +56,34 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
     return nullptr;
 }

-CLBufferMemoryRegion::CLBufferMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size)
-    : ICLMemoryRegion(ctx, size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
 {
-    if(_size != 0)
+    if (_size != 0)
     {
-        _mem = cl::Buffer((ctx != nullptr) ? ctx->context() : CLScheduler::get().context(), flags, _size);
+        _mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
     }
 }

-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer, CLCoreRuntimeContext *ctx)
-    : ICLMemoryRegion(ctx, buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
 {
     _mem = buffer;
 }

+CLBufferMemoryRegion::~CLBufferMemoryRegion()
+{
+    // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before
+    // this buffer is freed
+    // Do not call finish as it is a blocking call which affects the performance
+    try
+    {
+        CLScheduler::get().queue().flush();
+    }
+    catch (const std::exception &e)
+    {
+        ARM_COMPUTE_LOG_ERROR_ACL(e.what());
+    }
+}
+
 void *CLBufferMemoryRegion::ptr()
 {
     return nullptr;
@@ -93,30 +103,33 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }

-ICLSVMMemoryRegion::ICLSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLMemoryRegion(ctx, size), _ptr(nullptr)
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(size), _ptr(nullptr)
 {
-    if(size != 0)
+    if (size != 0)
     {
-        _ptr = clSVMAlloc((ctx != nullptr) ? ctx->context().get() : CLScheduler::get().context().get(), flags, size, alignment);
-        if(_ptr != nullptr)
+        _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
+        if (_ptr != nullptr)
         {
-            _mem = cl::Buffer((ctx != nullptr) ? ctx->context() : CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+            _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
         }
     }
 }

 ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
 {
-    if(_ptr != nullptr)
+    if (_ptr != nullptr)
     {
         try
         {
-            clFinish(_queue.get());
+            // Can only use the blocking finish instead of the non-blocking flush here, because clSVMFree requires all
+            // commands that may use the svm pointer to finish beforehand
+            // https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clSVMFree.html
+            clFinish(CLScheduler::get().queue().get());
             _mem = cl::Buffer();
             clSVMFree(_ctx.get(), _ptr);
         }
-        catch(...)
+        catch (...)
         {
         }
     }
@@ -127,15 +140,16 @@ void *ICLSVMMemoryRegion::ptr()
     return _ptr;
 }

-CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(ctx, flags, size, alignment)
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(flags, size, alignment)
 {
 }

 void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
-    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+                    nullptr);
     _mapping = _ptr;
     return _mapping;
 }
@@ -147,14 +161,14 @@ void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }

-CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(ctx, flags, size, alignment)
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(flags, size, alignment)
 {
 }

 void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
-    if(blocking)
+    if (blocking)
     {
         clFinish(q.get());
     }
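Note: the two destructors above encode different synchronisation rules. A cl_mem buffer is reference-counted by enqueued commands, so a non-blocking flush suffices before the handle is dropped; clSVMFree has no such protection, so every command that may touch the pointer must have completed first (per the OpenCL spec linked in the comment). A condensed sketch of the SVM rule:

    #include "arm_compute/core/CL/OpenCL.h" // library's OpenCL wrapper, assumed available

    void release_svm(cl::CommandQueue &queue, cl::Context &context, void *svm_ptr)
    {
        clFinish(queue.get());             // blocking: all commands using svm_ptr have completed
        clSVMFree(context.get(), svm_ptr); // only now is the allocation safe to free
    }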
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
deleted file mode 100644
index 14cd68a646..0000000000
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiHOG.h"
-
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/Error.h"
-
-using namespace arm_compute;
-
-CLMultiHOG::CLMultiHOG(size_t num_models)
-    : _num_models(num_models), _model()
-{
-    _model.resize(_num_models);
-}
-
-size_t CLMultiHOG::num_models() const
-{
-    return _num_models;
-}
-
-ICLHOG *CLMultiHOG::cl_model(size_t index)
-{
-    ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (&_model[index]);
-}
-
-const ICLHOG *CLMultiHOG::cl_model(size_t index) const
-{
-    ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (&_model[index]);
-}
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
deleted file mode 100644
index 92254f3531..0000000000
--- a/src/runtime/CL/CLMultiImage.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiImage.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-using namespace arm_compute;
-
-CLMultiImage::CLMultiImage()
-    : _info(), _plane()
-{
-}
-
-const MultiImageInfo *CLMultiImage::info() const
-{
-    return &_info;
-}
-
-void CLMultiImage::init(unsigned int width, unsigned int height, Format format)
-{
-    internal_init(width, height, format, false);
-}
-
-void CLMultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
-{
-    internal_init(width, height, format, true);
-}
-
-void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
-{
-    TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format);
-    TensorInfo  info(shape, Format::U8);
-
-    if(auto_padding)
-    {
-        info.auto_padding();
-    }
-
-    switch(format)
-    {
-        case Format::U8:
-        case Format::S16:
-        case Format::U16:
-        case Format::S32:
-        case Format::F16:
-        case Format::F32:
-        case Format::U32:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-        {
-            TensorInfo info_full(shape, format);
-
-            if(auto_padding)
-            {
-                info_full.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info_full);
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        {
-            const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
-            TensorInfo        info_uv88(shape_uv88, Format::UV88);
-
-            if(auto_padding)
-            {
-                info_uv88.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info_uv88);
-            break;
-        }
-        case Format::IYUV:
-        {
-            const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
-            TensorInfo        info_sub2(shape_sub2, Format::U8);
-
-            if(auto_padding)
-            {
-                info_sub2.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info_sub2);
-            std::get<2>(_plane).allocator()->init(info_sub2);
-            break;
-        }
-        case Format::YUV444:
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info);
-            std::get<2>(_plane).allocator()->init(info);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _info.init(shape.x(), shape.y(), format);
-}
-
-void CLMultiImage::allocate()
-{
-    switch(_info.format())
-    {
-        case Format::U8:
-        case Format::S16:
-        case Format::U16:
-        case Format::S32:
-        case Format::F16:
-        case Format::F32:
-        case Format::U32:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            std::get<0>(_plane).allocator()->allocate();
-            break;
-        case Format::NV12:
-        case Format::NV21:
-            std::get<0>(_plane).allocator()->allocate();
-            std::get<1>(_plane).allocator()->allocate();
-            break;
-        case Format::IYUV:
-        case Format::YUV444:
-            std::get<0>(_plane).allocator()->allocate();
-            std::get<1>(_plane).allocator()->allocate();
-            std::get<2>(_plane).allocator()->allocate();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-}
-
-CLImage *CLMultiImage::cl_plane(unsigned int index)
-{
-    return &_plane[index];
-}
-
-const CLImage *CLMultiImage::cl_plane(unsigned int index) const
-{
-    return &_plane[index];
-}
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/CL/CLOperator.cpp
index 4fb4e8b86d..89d4520038 100644
--- a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,22 +21,37 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"

-#include "support/MemorySupport.h"
+#include "src/core/CL/ICLKernel.h"

 namespace arm_compute
 {
-void NEComputeAllAnchors::configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info)
+namespace experimental
 {
-    // Configure ComputeAllAnchors kernel
-    auto k = arm_compute::support::cpp14::make_unique<NEComputeAllAnchorsKernel>();
-    k->configure(anchors, all_anchors, info);
-    _kernel = std::move(k);
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
+{
+}
+
+void ICLOperator::run(ITensorPack &tensors)
+{
+    if (tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors, false);
+}
+
+void ICLOperator::prepare(ITensorPack &constants)
+{
+    ARM_COMPUTE_UNUSED(constants);
 }

-Status NEComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+MemoryRequirements ICLOperator::workspace() const
 {
-    return NEComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+    return {};
 }
+} // namespace experimental
 } // namespace arm_compute
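Note: experimental operators are stateless with respect to tensor memory; callers inject tensors at run time through an ITensorPack instead of binding them at configure time. A hedged usage sketch; the slot ids follow the ACL_SRC_*/ACL_DST convention used elsewhere in the library, and op stands for any configured experimental::ICLOperator:

    // Assumes src and dst are allocated ICLTensors and op is already configured.
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::ACL_SRC_0, &src);
    pack.add_tensor(arm_compute::ACL_DST, &dst);
    op.run(pack); // ICLOperator::run raises an error on an empty pack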
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
deleted file mode 100644
index ef8a1e5294..0000000000
--- a/src/runtime/CL/CLPyramid.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLPyramid.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-
-#include <array>
-#include <cmath>
-
-using namespace arm_compute;
-
-CLPyramid::CLPyramid()
-    : _info(), _pyramid()
-{
-}
-
-void CLPyramid::init(const PyramidInfo &info)
-{
-    internal_init(info, false);
-}
-
-void CLPyramid::init_auto_padding(const PyramidInfo &info)
-{
-    internal_init(info, true);
-}
-
-void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
-{
-    _info = info;
-    _pyramid.resize(_info.num_levels());
-
-    size_t      w            = _info.width();
-    size_t      h            = _info.height();
-    size_t      ref_w        = w;
-    size_t      ref_h        = h;
-    const bool  is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
-    TensorShape tensor_shape = _info.tensor_shape();
-
-    // Note: Look-up table used by the OpenVX sample implementation
-    const std::array<float, 4> c_orbscale =
-    {
-        {
-            0.5f,
-            SCALE_PYRAMID_ORB,
-            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
-            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
-        }
-    };
-
-    for(size_t i = 0; i < _info.num_levels(); ++i)
-    {
-        TensorInfo tensor_info(tensor_shape, _info.format());
-
-        if(auto_padding)
-        {
-            tensor_info.auto_padding();
-        }
-
-        _pyramid[i].allocator()->init(tensor_info);
-
-        if(is_orb_scale)
-        {
-            const float orb_scale = c_orbscale[(i + 1) % 4];
-            w                     = std::ceil(ref_w * orb_scale);
-            h                     = std::ceil(ref_h * orb_scale);
-
-            if(0 == ((i + 1) % 4))
-            {
-                ref_w = w;
-                ref_h = h;
-            }
-        }
-        else
-        {
-            w = (w + 1) * _info.scale();
-            h = (h + 1) * _info.scale();
-        }
-
-        // Update tensor_shape
-        tensor_shape.set(0, w);
-        tensor_shape.set(1, h);
-    }
-}
-
-void CLPyramid::allocate()
-{
-    for(size_t i = 0; i < _info.num_levels(); ++i)
-    {
-        _pyramid[i].allocator()->allocate();
-    }
-}
-
-const PyramidInfo *CLPyramid::info() const
-{
-    return &_info;
-}
-
-CLTensor *CLPyramid::get_pyramid_level(size_t index) const
-{
-    ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
-
-    return &_pyramid[index];
-}
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 4d70edac2f..b426b8c304 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,10 +30,13 @@
 namespace arm_compute
 {
 CLRuntimeContext::CLRuntimeContext()
-    : _gpu_owned_scheduler(support::cpp14::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _core_context()
+    : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+      _gpu_scheduler(_gpu_owned_scheduler.get()),
+      _symbols(),
+      _backend_type()
 {
     _symbols.load_default();
-    auto ctx_dev_err = create_opencl_context_and_device();
+    auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
     ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
     auto ctx = std::get<0>(ctx_dev_err);
     auto dev = std::get<1>(ctx_dev_err);
@@ -40,7 +44,6 @@ CLRuntimeContext::CLRuntimeContext()
     _gpu_owned_scheduler->init(ctx, queue, dev, &_tuner);
     const std::string cl_kernels_folder("./cl_kernels");
     CLKernelLibrary::get().init(cl_kernels_folder, ctx, dev);
-    _core_context = CLCoreRuntimeContext(&CLKernelLibrary::get(), _gpu_owned_scheduler->context(), _gpu_owned_scheduler->queue());
 }

 CLKernelLibrary &CLRuntimeContext::kernel_library()
@@ -48,11 +51,6 @@ CLKernelLibrary &CLRuntimeContext::kernel_library()
     return CLKernelLibrary::get();
 }

-CLCoreRuntimeContext *CLRuntimeContext::core_runtime_context()
-{
-    return &_core_context;
-}
-
 void CLRuntimeContext::set_gpu_scheduler(CLScheduler *scheduler)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler);
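Note: CLRuntimeContext is now self-contained: it owns its CLScheduler, creates its own OpenCL context and queue at construction, and the CLCoreRuntimeContext member is gone. A hedged sketch of per-instance use, e.g. one context per worker thread:

    arm_compute::CLRuntimeContext ctx;                     // builds its own context, queue and scheduler
    arm_compute::CLScheduler *sched = ctx.gpu_scheduler(); // scoped scheduler, independent of CLScheduler::get()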
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index e78eaa482f..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,10 +24,9 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"

 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/Tuners.h"
+
+#include "src/core/CL/ICLKernel.h"

 namespace arm_compute
 {
@@ -49,6 +48,11 @@ GPUTarget CLScheduler::target() const
     return _target;
 }

+CLGEMMHeuristicsHandle *CLScheduler::gemm_heuristics() const
+{
+    return _gemm_heuristics;
+}
+
 void CLScheduler::set_queue(cl::CommandQueue queue)
 {
     _queue = std::move(queue);
@@ -78,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()

 void CLScheduler::tune_kernel_static(ICLKernel &kernel)
 {
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         _cl_tuner->tune_kernel_static(kernel);
     }
@@ -92,7 +96,16 @@ bool CLScheduler::is_initialised() const
 std::once_flag CLScheduler::_initialize_symbols;

 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _cl_default_static_tuner(nullptr)
+    : _context(),
+      _queue(),
+      _target(GPUTarget::MIDGARD),
+      _is_initialised(false),
+      _cl_tuner(nullptr),
+      _gemm_heuristics(nullptr),
+      _backend_type(CLBackendType::Native),
+      _job_chaining_enabled(true),
+      _job_chaining_size(1),
+      _job_chaining_count(0)
 {
 }

@@ -103,37 +116,45 @@ CLScheduler &CLScheduler::get()
     return scheduler;
 }

-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+void CLScheduler::default_init_with_context(cl::Device &device,
+                                            cl::Context &ctx,
+                                            ICLTuner *cl_tuner,
+                                            CLGEMMHeuristicsHandle *gemm_h)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         const std::string cl_kernels_folder("./cl_kernels/");
         cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
         CLKernelLibrary::get().init(cl_kernels_folder, ctx, device);
-        init(ctx, queue, device, cl_tuner);
-        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
-        _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+        init(ctx, queue, device, cl_tuner, gemm_h);
+        _cl_tuner = cl_tuner;
     }
 }

-void CLScheduler::default_init(ICLTuner *cl_tuner)
+void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         cl::Context ctx;
         cl::Device  dev;
         cl_int      err;
-        std::tie(ctx, dev, err) = create_opencl_context_and_device();
+        std::tie(ctx, dev, err) = create_opencl_context_and_device(cl_backend_type);
         ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
         cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
         CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
-        init(ctx, queue, dev, cl_tuner);
-        // Create a default static tuner and set if none was provided
-        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+        init(ctx, queue, dev, cl_tuner, gemm_h);
     }
-    // Set CL tuner
-    _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+    // Set CL tuner and GEMM heuristics
+    _cl_tuner        = cl_tuner;
+    _gemm_heuristics = gemm_h;
+}
+
+void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+{
+    _is_initialised = false;
+
+    default_init(cl_tuner, gemm_h, cl_backend_type);
 }

 void CLScheduler::set_context(cl::Context context)
@@ -142,34 +163,86 @@ void CLScheduler::set_context(cl::Context context)
     CLKernelLibrary::get().set_context(_context);
 }

-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+void CLScheduler::init(cl::Context context,
+                       cl::CommandQueue queue,
+                       const cl::Device &device,
+                       ICLTuner *cl_tuner,
+                       CLGEMMHeuristicsHandle *gemm_h,
+                       CLBackendType cl_backend_type)
 {
     set_context(std::move(context));
-    _queue          = std::move(queue);
-    _target         = get_target_from_device(device);
-    _is_initialised = true;
-    _cl_tuner       = cl_tuner;
+    _queue           = std::move(queue);
+    _target          = get_target_from_device(device);
+    _is_initialised  = true;
+    _cl_tuner        = cl_tuner;
+    _gemm_heuristics = gemm_h;
+    _backend_type    = cl_backend_type;
 }

-void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
-                             "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+    ARM_COMPUTE_ERROR_ON_MSG(
+        !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");

+    const bool inject_memory = !tensors.empty();
+
     // Tune the kernel if the CLTuner has been provided
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
-        // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel_dynamic(kernel);
+        inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
     }

     // Run kernel
-    kernel.run(kernel.window(), _queue);
+    inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
+    if (_job_chaining_enabled)
+    {
+        ++_job_chaining_count;
+    }
+
+    flush_queue(flush);
+}

-    if(flush)
+void CLScheduler::flush_queue(bool flush)
+{
+    if (_job_chaining_enabled)
+    {
+        if (_job_chaining_count >= _job_chaining_size)
+        {
+            _job_chaining_count = 0;
+            /*
+                Optimisation note: Flush the queue at the first enqueue to start the GPU
+                execution and then incrementally saturate the clFlush calls to minimize
+                the CPU activity for job-scheduling.
+                For eg. job-chain size goes from 1, 2, 4, 8 and 16
+            */
+            if (_job_chaining_size < 16)
+            {
+                _job_chaining_size <<= 1;
+            }
+            _queue.flush();
+        }
+    }
+    else if (flush)
     {
         _queue.flush();
     }
 }
+
+void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+{
+    ITensorPack pack;
+    enqueue_common(kernel, pack, flush);
+}
+
+void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush)
+{
+    enqueue_common(kernel, tensors, flush);
+}
+
+void CLScheduler::enable_job_chaining(int job_chaining_size)
+{
+    _job_chaining_enabled = true;
+    _job_chaining_size    = job_chaining_size;
+}
 } // namespace arm_compute
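Note: job chaining replaces the per-kernel flush with batched flushes whose batch size doubles from 1 up to a steady 16, so the first kernels still start promptly while long command streams spend far less CPU time in clFlush. A standalone sketch of just the flush schedule (prints flushes after jobs 0, 2, 6, 14, 30, ...):

    #include <cstdio>

    int main()
    {
        int chain_size = 1, count = 0;
        for (int job = 0; job < 64; ++job)
        {
            if (++count >= chain_size)
            {
                count = 0;
                if (chain_size < 16)
                {
                    chain_size <<= 1; // 1 -> 2 -> 4 -> 8 -> 16, then capped
                }
                std::printf("flush after job %d\n", job);
            }
        }
        return 0;
    }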
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 0f362507cf..ace820bbb7 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019, 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,12 +29,14 @@
 using namespace arm_compute;

-CLSubTensor::CLSubTensor()
-    : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
 {
 }

-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor *parent,
+                         const TensorShape &tensor_shape,
+                         const Coordinates &coords,
+                         bool extend_parent)
     : _parent(nullptr), _info()
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,11 +83,15 @@ void CLSubTensor::unmap()
 uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+    if (_parent->buffer() == nullptr)
+    {
+        _parent->map(q, blocking);
+    }
+    return _parent->buffer();
 }

 void CLSubTensor::do_unmap(cl::CommandQueue &q)
 {
     ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
-    q.enqueueUnmapMemObject(cl_buffer(), buffer());
+    _parent->unmap(q);
 }
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index a6d0cf77ca..db94639190 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
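Note: CLSubTensor::do_map no longer enqueues its own map of the shared cl_mem; it maps the parent tensor (if not already mapped) and returns the parent's host pointer, and do_unmap likewise delegates, avoiding two live mappings of one buffer. A condensed sketch of the delegation, assuming the ICLTensor interface shown above:

    #include "arm_compute/core/CL/ICLTensor.h" // assumed header for ICLTensor
    #include <cstdint>

    uint8_t *map_via_parent(arm_compute::ICLTensor *parent, cl::CommandQueue &q, bool blocking)
    {
        if (parent->buffer() == nullptr) // parent not mapped yet
        {
            parent->map(q, blocking);
        }
        return parent->buffer(); // the sub-tensor view lives inside the parent allocation
    }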
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index eaf46d42ca..e6457218c7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,34 +31,33 @@
 namespace arm_compute
 {
 const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
-
 namespace
 {
+/** Global user-defined allocator that can be used for all internal allocations of a CLTensor */
+static IAllocator *static_global_cl_allocator = nullptr;
+
 /** Helper function used to allocate the backing memory of a tensor
 *
- * @param[in] context   OpenCL context to use
 * @param[in] size      Size of the allocation
 * @param[in] alignment Alignment of the allocation
 *
 * @return A wrapped memory region
 */
-std::unique_ptr<ICLMemoryRegion> allocate_region(CLCoreRuntimeContext *ctx, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(ctx,
-                                                                                                 CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
-                                                                                                 size,
-                                                                                                 alignment);
+    std::unique_ptr<ICLMemoryRegion> region =
+        std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);

     // Try coarse-grain SVM in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(ctx, CL_MEM_READ_WRITE, size, alignment);
+        region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
     }

     // Try legacy buffer memory in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLBufferMemoryRegion>(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+        region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }

     return region;
 }
@@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset)
 * @param[in] qinfo    Quantization info
 * @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes
 */
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray &scale,
+                                CLInt32Array &offset,
+                                const QuantizationInfo &qinfo,
+                                size_t pad_size)
 {
     clear_quantization_arrays(scale, offset);
@@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
     const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
     scale                     = CLFloatArray(num_elements + pad_size);
     scale.resize(num_elements);
-    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+                                                  qinfo.scale().data());

-    if(!qinfo.offset().empty())
+    if (!qinfo.offset().empty())
     {
         // Create offset array
-        const std::vector<int32_t> &qoffset             = qinfo.offset();
-        const size_t                offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
-        offset                                          = CLInt32Array(num_elements + pad_size);
+        const std::vector<int32_t> &qoffset = qinfo.offset();
+        const size_t offset_element_size    = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+        offset                              = CLInt32Array(num_elements + pad_size);
         offset.resize(num_elements);
offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,26 +131,26 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(_ctx == nullptr) + if (static_global_cl_allocator != nullptr) { - auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue()); - _memory.set_owned_region(allocate_region(&legacy_ctx, info().total_size(), 0)); + _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } else { - _memory.set_owned_region(allocate_region(_ctx->core_runtime_context(), info().total_size(), 0)); + _memory.set_owned_region(allocate_region(info().total_size(), 0)); } } else { + // Finalize memory management instead _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment()); } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -171,15 +175,7 @@ Status CLTensorAllocator::import_memory(cl::Buffer buffer) ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get()); ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr); - if(_ctx == nullptr) - { - auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue()); - _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, &legacy_ctx)); - } - else - { - _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, _ctx->core_runtime_context())); - } + _memory.set_owned_region(std::make_unique<CLBufferMemoryRegion>(buffer)); info().set_is_resizable(false); return Status{}; @@ -194,9 +190,14 @@ void CLTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_mem _associated_memory_group = associated_memory_group; } +void CLTensorAllocator::set_global_allocator(IAllocator *allocator) +{ + static_global_cl_allocator = allocator; +} + uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -209,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 5f2fa7d0e2..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,27 +22,52 @@ * SOFTWARE. 
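The static_global_cl_allocator hook above lets a user route every internal CLTensor allocation through their own IAllocator. A hedged wiring sketch, assuming a custom allocator type ExternalPoolAllocator (hypothetical) that implements the IAllocator interface this patch already uses elsewhere:

#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/IAllocator.h"

// Hypothetical user-defined allocator; any IAllocator implementation works.
class ExternalPoolAllocator final : public arm_compute::IAllocator
{
public:
    void *allocate(size_t size, size_t alignment) override; // hand out pooled memory
    void free(void *ptr) override;                          // return it to the pool
    std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override; // wrap it as a region
};

ExternalPoolAllocator pool;                                  // must outlive all CLTensors
arm_compute::CLTensorAllocator::set_global_allocator(&pool); // allocate() above now calls pool.make_region()

Note the global allocator is only consulted on the unmanaged path; tensors owned by a memory group still go through finalize_memory(), as the allocate() hunk shows.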
*/ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLLWSList.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" #include <cerrno> #include <fstream> -#include <iostream> #include <limits> -#include <memory> -#include <string> namespace arm_compute { -CLTuner::CLTuner(bool tune_new_kernels) - : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::NORMAL) +CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } +struct CLTuner::IKernelData +{ + virtual ~IKernelData() = default; + virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; +}; +struct DefaultKernelData : public CLTuner::IKernelData +{ + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} + { + } + ~DefaultKernelData() override = default; + void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override + { + const bool inject_memory = !_tensors.empty(); + inject_memory ? kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue); + } + +private: + ITensorPack &_tensors; +}; + bool CLTuner::kernel_event_is_set() const { return _kernel_event() != nullptr; @@ -63,11 +88,7 @@ bool CLTuner::tune_new_kernels() const void CLTuner::set_tuner_mode(CLTunerMode mode) { - _tuner_mode = mode; -} -CLTunerMode CLTuner::get_tuner_mode() const -{ - return _tuner_mode; + _tuning_info.tuner_mode = mode; } void CLTuner::tune_kernel_static(ICLKernel &kernel) @@ -77,48 +98,69 @@ void CLTuner::tune_kernel_static(ICLKernel &kernel) void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) { + ITensorPack pack; + tune_kernel_dynamic(kernel, pack); +} + +void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) +{ // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. 
If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { - auto p = _lws_table.find(config_id); + auto p = _tuning_params_table.find(config_id); - if(p == _lws_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel - cl::NDRange opt_lws = find_optimal_lws(kernel); + CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); // Insert the optimal LWS in the table - add_lws_to_table(config_id, opt_lws); + add_tuning_params(config_id, opt_tuning_params); // Set Local-Workgroup-Size - kernel.set_lws_hint(opt_lws); + kernel.set_lws_hint(opt_tuning_params.get_lws()); + if (_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); + } } } else { // Set Local-Workgroup-Size - kernel.set_lws_hint(p->second); + kernel.set_lws_hint(p->second.get_lws()); + if (_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(p->second.get_wbsm()); + } } } } +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) +{ + DefaultKernelData data{tensors}; -void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws) + do_tune_kernel_dynamic(kernel, &data); +} + +void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params) { - _lws_table.emplace(kernel_id, optimal_lws); + _tuning_params_table.emplace(kernel_id, optimal_tuning_params); } -cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) +CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data) { // Profiling queue cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -129,7 +171,7 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -140,21 +182,23 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
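The profiling queue and the enqueue interceptor above exist so the tuner can time a single kernel slice with OpenCL events. A standalone sketch of that measurement pattern, using only the standard OpenCL C++ API (the kernel, gws and lws variables are placeholders):

// Requires a queue created with CL_QUEUE_PROFILING_ENABLE, as done above.
cl::Event ev;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, lws, nullptr, &ev);
queue.finish(); // make sure the event's timestamps are final
const cl_ulong start      = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
const cl_ulong end        = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
const cl_ulong elapsed_ns = end - start; // device-side execution time, what min_exec_time tracks below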
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -164,10 +208,19 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline - kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); + + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -176,28 +229,36 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) cl_ulong min_exec_time = end - start; _kernel_event = nullptr; - cl::NDRange opt_lws = cl::NullRange; + CLTuningParams opt_tuning_params(cl::NullRange, 0); - //Construct the list of LWS values to be tested based on the tuner mode. - auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws); - for(size_t i = 0; i < lws_list->size(); ++i) + // Construct the list of tuning parameters values to be tested based on the tuner mode. 
+ auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); + for (size_t i = 0; i < tuning_list->size(); ++i) { - cl::NDRange lws_test = (*lws_list)[i]; + CLTuningParams tuning_test = (*tuning_list)[i]; + // Setting the lws + cl::NDRange lws_test = tuning_test.get_lws(); auto x = lws_test[0]; auto y = lws_test[1]; auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + { + cl_int wbsm_test = tuning_test.get_wbsm(); + kernel.set_wbsm_hint(wbsm_test); + } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel - kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); queue_profiler.finish(); @@ -207,28 +268,31 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; - opt_lws = cl::NDRange(x, y, z); + opt_tuning_params.set_lws(tuning_test.get_lws()); + if (_tuning_info.tune_wbsm) + { + opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); + } } } // Restore real function CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; - - return opt_lws; + return opt_tuning_params; } -void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table) +const std::unordered_map<std::string, CLTuningParams> &CLTuner::tuning_params_table() const { - _lws_table.clear(); - _lws_table = lws_table; + return _tuning_params_table; } -const std::unordered_map<std::string, cl::NDRange> &CLTuner::lws_table() const +void CLTuner::import_tuning_params(const std::unordered_map<std::string, CLTuningParams> &tuning_params_table) { - return _lws_table; + _tuning_params_table.clear(); + _tuning_params_table = tuning_params_table; } void CLTuner::load_from_file(const std::string &filename) @@ -236,49 +300,79 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; - while(!std::getline(fs, line).fail()) + bool header_line = true; + while (!std::getline(fs, line).fail()) { - std::istringstream ss(line); - std::string token; - if(std::getline(ss, token, ';').fail()) - { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); - } - std::string kernel_id = token; - cl::NDRange lws(1, 1, 1); - for(int i = 0; i < 3; i++) + if (header_line) { - if(std::getline(ss, token, ';').fail()) + header_line = false; + size_t pos_lws = line.find("lws"); + size_t pos_wbsm = line.find("wbsm"); + _tuning_info.tune_wbsm = false; + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); + // The file has in the first line the parameters it has been tuned on + if (pos_wbsm != std::string::npos) + { + _tuning_info.tune_wbsm = true; + } + 
// Once the line with the tuning parameter is read we can + // read the next one to start collecting the values + if (std::getline(fs, line).fail()) + { + break; + } } - lws.get()[i] = support::cpp11::stoi(token); } - // If all dimensions are 0: reset to NullRange (i.e nullptr) - if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + CLTuningParams tuning_params; + size_t pos = line.find(";"); + if (pos == std::string::npos) { - lws = cl::NullRange; + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } - add_lws_to_table(kernel_id, lws); + std::string kernel_id = line.substr(0, pos); + line.erase(0, pos + 1); + if (!tuning_params.from_string(_tuning_info, line)) + { + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); + } + add_tuning_params(kernel_id, tuning_params); } fs.close(); } -void CLTuner::save_to_file(const std::string &filename) const +bool CLTuner::save_to_file(const std::string &filename) const { + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + { + return false; + } std::ofstream fs; fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); fs.open(filename, std::ios::out); - for(auto const &kernel_data : _lws_table) + std::string header_string = ""; + header_string += "lws"; + if (_tuning_info.tune_wbsm) + { + if (!header_string.empty()) + { + header_string += " "; + } + header_string += "wbsm"; + } + fs << header_string << std::endl; + for (auto const &kernel_data : _tuning_params_table) { - fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl; + CLTuningParams tun_pams(kernel_data.second); + fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; } fs.close(); + return true; } } // namespace arm_compute diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index fb8eba8aa4..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,18 +27,21 @@ #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx) { } +ICLSimpleFunction::~ICLSimpleFunction() = default; + void ICLSimpleFunction::run() { ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured"); - schedule_kernel_on_ctx(_ctx, &_border_handler, false); + schedule_kernel_on_ctx(_ctx, _border_handler.get(), false); schedule_kernel_on_ctx(_ctx, _kernel.get()); } diff --git a/src/runtime/CL/TracePoint.cpp b/src/runtime/CL/TracePoint.cpp deleted file mode 100644 index 97029f532e..0000000000 --- a/src/runtime/CL/TracePoint.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. 
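Putting the tuner pieces above together, a hedged end-to-end usage sketch for the reworked CLTuner (the file name is illustrative; save_to_file now reports through its bool return whether anything was written):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

using namespace arm_compute;

CLTuner tuner(true /* tune_new_kernels */);
tuner.set_tuner_mode(CLTunerMode::NORMAL); // stored in _tuning_info.tuner_mode now
CLScheduler::get().default_init(&tuner);   // every enqueued kernel consults the tuner

// ... configure and run CL functions; unseen config_ids trigger a parameter search ...

if (!tuner.save_to_file("acl_tuning_params.csv"))
{
    // nothing was tuned, tuning is disabled, or the file name is empty
}
// A later run can skip the search entirely:
// tuner.load_from_file("acl_tuning_params.csv");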
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/TracePoint.h" -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" -#include "utils/TypePrinter.h" - -#include <cstdio> -#include <vector> - -namespace arm_compute -{ -TRACE_TO_STRING(CLPyramid) -TRACE_TO_STRING(LSTMParams<ICLTensor>) -TRACE_TO_STRING(CLCoordinates2DArray) -CONST_PTR_CLASS(CLPyramid) -CONST_PTR_CLASS(LSTMParams<ICLTensor>) -CONST_PTR_CLASS(CLCoordinates2DArray) -} // namespace arm_compute diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index 5e22dfd4eb..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/runtime/CL/Utils.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -33,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -58,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Program program(context, devices, binaries); program.build(); @@ -70,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp deleted file mode 100644 index 492c54e4d3..0000000000 --- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
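restore_program_cache_from_file() and save_program_cache_to_file() above give a simple warm-start path for the OpenCL program cache. A hedged usage sketch (the cache file name is illustrative):

#include "arm_compute/runtime/CL/Utils.h"

void run_with_binary_cache()
{
    // Load previously built program binaries, if present; this also
    // default-initialises the scheduler when needed, as the hunk above shows.
    arm_compute::restore_program_cache_from_file("cl_program_cache.bin");

    // ... configure and run CL functions: cached programs skip online compilation ...

    // Persist whatever the kernel library built during this run.
    arm_compute::save_program_cache_to_file("cl_program_cache.bin");
}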
- */ -#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" - -#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); -} - -void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>(); - k->configure(compile_context, input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp deleted file mode 100644 index a81d1d042b..0000000000 --- a/src/runtime/CL/functions/CLAccumulate.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLAccumulate.h" - -#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, accum); -} - -void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>(); - k->configure(compile_context, input, accum); - _kernel = std::move(k); -} - -void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum); -} - -void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>(); - k->configure(compile_context, input, alpha, accum); - _kernel = std::move(k); -} - -void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum); -} - -void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>(); - k->configure(compile_context, input, shift, accum); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 989603a9df..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,63 @@ */ #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" -#include "support/MemorySupport.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClActivation.h" namespace arm_compute { -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : ICLSimpleFunction(ctx) +struct CLActivationLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<opencl::ClActivation> op{nullptr}; +}; + +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { + _impl->ctx = ctx; } +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; +CLActivationLayer::~CLActivationLayer() = default; void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + _impl->dst = output == nullptr ? input : output; + + _impl->op = std::make_unique<opencl::ClActivation>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); +} + +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(); - k->configure(compile_context, input, output, act_info); - _kernel = std::move(k); + return opencl::ClActivation::validate(input, output, act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void CLActivationLayer::run() { - return CLActivationLayerKernel::validate(input, output, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index 5b4c694f33..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
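CLActivationLayer above now forwards to the operator-level opencl::ClActivation through a small Impl struct, but the public API is unchanged. A hedged usage sketch (tensor initialisation and allocation elided):

using namespace arm_compute;

CLTensor src, dst;
// ... initialise src/dst TensorInfo and allocate their backing memory ...

CLActivationLayer act;
act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
act.run(); // packs ACL_SRC/ACL_DST into an ITensorPack and runs ClActivation

// In-place execution is also supported: a nullptr output makes dst alias src,
// per the `output == nullptr ? input : output` line in the configure() hunk.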
* * SPDX-License-Identifier: MIT * @@ -27,80 +27,65 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; + +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, 
shape_before_reshape, output_data_type, input_num_channles, input_qinfo); - if(num_of_stages == 1) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &not_reshaped_output, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector<TensorInfo> sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input->tensor_shape() }; - - for(unsigned int i = 0; i < num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - sums_vector[i].set_data_type(input->data_type()); - sums_vector[i].set_tensor_shape(shape); - sums_vector[i].set_num_channels(input->num_channels()); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &sums_vector[0], axis, op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op)); - } - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &not_reshaped_output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output)); return Status{}; } @@ -109,53 +94,43 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); - _reduction_axis = axis; + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + _reduction_axis = axis; - // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ?
DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; + not_reshaped_output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>(); + _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); _memory_group.manage(&_not_reshaped_output); - // Create temporary tensors - if(_num_of_stages == 1) - { - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op); - } - else - { - _results_vector.resize(_num_of_stages - 1); - TensorShape shape{ input->info()->tensor_shape() }; - for(unsigned int i = 0; i < _num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); - } - - // Apply ReductionOperation only on first kernel - _memory_group.manage(&_results_vector[0]); - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op); - _results_vector[last_stage - 1].allocator()->allocate(); - } - _reshape_kernel.configure(compile_context, &_not_reshaped_output, output); + + _reshape.configure(compile_context, &_not_reshaped_output, output); _not_reshaped_output.allocator()->allocate(); } @@ -163,10 +138,7 @@ void CLArgMinMaxLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); - } - CLScheduler::get().enqueue(_reshape_kernel, false); + CLScheduler::get().enqueue(*_arg_min_max_kernel, false); + _reshape.run(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index 9fc51136b3..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,35 +30,58 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +namespace arm_compute +{ CLBatchNormalizationLayer::CLBatchNormalizationLayer() - : _norm_kernel() + : _norm_kernel(std::make_unique<CLBatchNormalizationLayerKernel>()) { } -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; + +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { - _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } void CLBatchNormalizationLayer::run() { - CLScheduler::get().enqueue(_norm_kernel, true); + CLScheduler::get().enqueue(*_norm_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index 0a2ae2a6e0..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
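CLBatchNormalizationLayer above keeps its single fused kernel, now held through a unique_ptr so the kernel type can stay out of the public header. A hedged usage sketch showing the fused-activation parameter (tensor setup elided; the 1e-5f epsilon is an illustrative choice):

using namespace arm_compute;

CLTensor x, y, mean, var, beta, gamma;
// ... initialise and allocate the six tensors ...

CLBatchNormalizationLayer bn;
bn.configure(&x, &y, &mean, &var, &beta, &gamma, 1e-5f,
             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
bn.run(); // one enqueue: normalization and activation fused in CLBatchNormalizationLayerKernel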
* * SPDX-License-Identifier: MIT * @@ -30,44 +30,66 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel() +namespace arm_compute +{ +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) { } +CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; + void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { - _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { - return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } void CLBatchToSpaceLayer::run() { - CLScheduler::get().enqueue(_batch_to_space_kernel, true); + 
CLScheduler::get().enqueue(*_batch_to_space_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index 1fa80f0a24..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 46595191a0..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -37,7 +37,9 @@ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>(); - k->configure(compile_context, input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 8431140cb8..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
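The four bitwise functions in this patch (AND and NOT above, OR and XOR below) collapse their per-operation kernels into the single CLBitwiseKernel selected by a BitwiseOperation tag. A hypothetical helper, not part of the change, that makes the shared shape explicit:

// Sketch: one kernel class serves all four ops; NOT is unary, so input2 is nullptr.
std::unique_ptr<CLBitwiseKernel> make_bitwise_kernel(const CLCompileContext &ctx,
                                                     const ICLTensor        *input1,
                                                     const ICLTensor        *input2, // nullptr for NOT
                                                     ICLTensor              *output,
                                                     BitwiseOperation        op)
{
    auto k = std::make_unique<CLBitwiseKernel>();
    k->configure(ctx, input1, input2, output, op);
    return k;
}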
* * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index 0e0e7f2028..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::XOR); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 55bcde749c..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,37 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); + // Configure Bounding Box kernel - auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>(); + auto k = std::make_unique<CLBoundingBoxTransformKernel>(); k->configure(compile_context, boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp deleted file mode 100644 index 72c822197c..0000000000 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
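CLBoundingBoxTransform above stays a thin ICLSimpleFunction wrapper, and its validate() can be used as a preflight before configure(). A hedged sketch; treat the BoundingBoxTransformInfo arguments (image width, image height, scale) as an assumption about the constructor rather than a documented recipe:

using namespace arm_compute;

CLTensor boxes, pred_boxes, deltas;
// ... initialise the proposal, delta and output tensors per the kernel's documentation ...

const BoundingBoxTransformInfo info(800.f /* img_width */, 600.f /* img_height */, 1.f /* scale */);
ARM_COMPUTE_ERROR_THROW_ON(
    CLBoundingBoxTransform::validate(boxes.info(), pred_boxes.info(), deltas.info(), info));

CLBoundingBoxTransform transform;
transform.configure(&boxes, &pred_boxes, &deltas, info);
transform.run();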
- */ -#include "arm_compute/runtime/CL/functions/CLBox3x3.h" - -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp deleted file mode 100644 index 0c8d3532aa..0000000000 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLCannyEdge.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _gradient(), - _border_mag_gradient(), - _non_max_suppr(), - _edge_trace(), - _gx(), - _gy(), - _mag(), - _phase(), - _nonmax(), - _visited(), - _recorded(), - _l1_list_counter(), - _l1_stack(), - _output(nullptr) -{ -} - -void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value); -} - -void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, - BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); - ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); - ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr)); - - _output = output; - - const unsigned int L1_hysteresis_stack_size = 8; - const TensorShape shape = input->info()->tensor_shape(); - - TensorInfo gradient_info; - TensorInfo info; - - // Initialize images - if(gradient_size < 7) - { - gradient_info.init(shape, 1, arm_compute::DataType::S16); - info.init(shape, 1, arm_compute::DataType::U16); - } - else - { - gradient_info.init(shape, 1, arm_compute::DataType::S32); - info.init(shape, 1, arm_compute::DataType::U32); - } - - _gx.allocator()->init(gradient_info); - _gy.allocator()->init(gradient_info); - _mag.allocator()->init(info); - _nonmax.allocator()->init(info); - - TensorInfo info_u8(shape, 1, arm_compute::DataType::U8); - _phase.allocator()->init(info_u8); - _l1_list_counter.allocator()->init(info_u8); - - TensorInfo info_u32(shape, 1, arm_compute::DataType::U32); - _visited.allocator()->init(info_u32); - _recorded.allocator()->init(info_u32); - - TensorShape shape_l1_stack = input->info()->tensor_shape(); - shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size); - TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32); - _l1_stack.allocator()->init(info_s32); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Configure/Init sobelNxN - if(gradient_size == 3) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 5) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>(); - 
k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 7) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else - { - ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size); - } - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Configure gradient - _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Configure non-maxima suppression - _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); - - // Allocate intermediate buffers - _phase.allocator()->allocate(); - - // Fill border around magnitude image as non-maxima suppression will access - // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); - - // Allocate intermediate buffers - _mag.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_visited); - _memory_group.manage(&_recorded); - _memory_group.manage(&_l1_stack); - _memory_group.manage(&_l1_list_counter); - - // Configure edge tracing - _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); - - // Allocate intermediate buffers - _visited.allocator()->allocate(); - _recorded.allocator()->allocate(); - _l1_stack.allocator()->allocate(); - _l1_list_counter.allocator()->allocate(); - _nonmax.allocator()->allocate(); -} - -void CLCannyEdge::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run sobel - _sobel->run(); - - // Run phase and magnitude calculation - CLScheduler::get().enqueue(_gradient, false); - - // Fill border before non-maxima suppression. Nop for border mode undefined. - CLScheduler::get().enqueue(_border_mag_gradient, false); - - // Run non max suppresion - _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_non_max_suppr, false); - - // Clear temporary structures and run edge trace - _output->clear(CLScheduler::get().queue()); - _visited.clear(CLScheduler::get().queue()); - _recorded.clear(CLScheduler::get().queue()); - _l1_list_counter.clear(CLScheduler::get().queue()); - _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_edge_trace, true); -} diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 7048a79bc5..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,27 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLCast.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> namespace arm_compute { +struct CLCast::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; +}; + +CLCast::CLCast() : _impl(std::make_unique<Impl>()) +{ +} +CLCast::CLCast(CLCast &&) = default; +CLCast &CLCast::operator=(CLCast &&) = default; +CLCast::~CLCast() = default; + void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) { configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>(); - k->configure(compile_context, input, output, policy, 0); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, policy); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClCast>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy) { - return CLDepthConvertLayerKernel::validate(input, output, policy, 0); + return opencl::ClCast::validate(input, output, policy); +} + +void CLCast::run() +{ + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp deleted file mode 100644 index 249212e03b..0000000000 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLChannelCombine.h" - -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>(); - k->configure(compile_context, plane0, plane1, plane2, plane3, output); - _kernel = std::move(k); -} - -void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>(); - k->configure(compile_context, plane0, plane1, plane2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp deleted file mode 100644 index 019e0a7a90..0000000000 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLChannelExtract.h" - -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} - -void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 93ab7c7ddf..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -34,9 +35,13 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { - auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); + auto k = std::make_unique<CLChannelShuffleLayerKernel>(); k->configure(compile_context, input, output, num_groups); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp deleted file mode 100644 index b8e597751b..0000000000 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLColorConvert.h" - -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 8d5ec3571d..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,9 +24,11 @@
 #include "arm_compute/runtime/CL/functions/CLComparison.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
 #include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
@@ -35,24 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output,
+                             ComparisonOperation     operation)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation);
+    auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, operation);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo  *input1,
+                              const ITensorInfo  *input2,
+                              const ITensorInfo  *output,
+                              ComparisonOperation operation)
 {
     return CLComparisonKernel::validate(input1, input2, output, operation);
 }
@@ -64,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
 }
 
 template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+                                        ICLTensor              *input1,
+                                        ICLTensor              *input2,
+                                        ICLTensor              *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    auto k = std::make_unique<CLComparisonKernel>();
    k->configure(compile_context, input1, input2, output, COP);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
    {
        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
        {
-            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
        }
    }
 }
 
 template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     return CLComparisonKernel::validate(input1, input2, output, COP);
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index e97256713f..9df1c34593 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,238 +23,77 @@
 */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
 
 namespace arm_compute
 {
-CLConcatenateLayer::CLConcatenateLayer()
-    : _concat_kernels(),
-      _num_inputs(0),
-      _axis(Window::DimX)
+struct CLConcatenateLayer::Impl
 {
-}
-
-void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+    std::vector<const ICLTensor *>         srcs{};
+    ICLTensor                             *dst{nullptr};
+    unsigned int                           num_inputs{0};
+    unsigned int                           axis{0};
+    std::unique_ptr<opencl::ClConcatenate> op{nullptr};
+};
+
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
 {
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
 }
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
+CLConcatenateLayer::CLConcatenateLayer(CLConcatenateLayer &&) = default;
 
-void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
-}
+CLConcatenateLayer &CLConcatenateLayer::operator=(CLConcatenateLayer &&) = default;
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
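The hunk above is representative of the whole changeset: the function-level state (the kernel vector, _num_inputs, _axis) moves into a pimpl struct that owns a stateless opencl::ClConcatenate operator, and the actual tensors are only bound at run time through an ITensorPack. A minimal usage sketch against the new public interface follows; the tensor names, shapes, and the default_init() call are illustrative assumptions, not taken from this diff.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    // Assumed setup: create a default OpenCL context/queue for the sketch.
    CLScheduler::get().default_init();

    // Two illustrative 8x4 F32 inputs, concatenated along Y into an 8x8 output.
    CLTensor src0, src1, dst;
    src0.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    src1.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

    // configure() now only records tensor infos in the underlying ClConcatenate.
    std::vector<const ICLTensor *> inputs = {&src0, &src1};
    CLConcatenateLayer             concat;
    concat.configure(inputs, &dst, Window::DimY);

    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst.allocator()->allocate();

    concat.run(); // builds the ITensorPack (ACL_SRC_VEC + i, ACL_DST) and dispatches it

    return 0;
}

The design point of the pack-based run() is that kernel configuration is decoupled from tensor memory, which is what allows the runtime layer to inject or reuse working memory per call rather than baking buffer pointers into the operator.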
+CLConcatenateLayer::~CLConcatenateLayer() = default; -Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) +void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis) { - return validate_internal(inputs_vector, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -template <typename TensorType> -void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector<const ICLTensor *> &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); - _axis = axis; - _num_inputs = inputs_vector.size(); + ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); - std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); + _impl->srcs = inputs_vector; + _impl->dst = output; + _impl->axis = axis; + _impl->num_inputs = inputs_vector.size(); + _impl->op = std::make_unique<opencl::ClConcatenate>(); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis)); - - unsigned int offset = 0; - switch(_axis) + std::vector<ITensorInfo *> inputs_vector_info; + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { - case Window::DimX: - { - switch(_num_inputs) - { - case 2: - { - // Configure WidthConcatenate2Tensors kernel - auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>(); - kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 4: - { - // Configure WidthConcatenate4Tensors kernel - auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>(); - kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - { - // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - } - break; - } - case Window::DimY: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case 
Window::DimZ: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case 3: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); + ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); + inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); } + _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -template <typename TensorInfoType> -Status CLConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); - const unsigned int num_inputs = inputs_vector.size(); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimX: - { - switch(num_inputs) - { - case 2: - // Validate WidthConcatenate2Tensors kernels if there are 2 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output)); - break; - case 4: - // Validate WidthConcatenate4Tensors kernels if there are 4 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output)); - break; - default: - // Validate generic case of WidthConcatenate kernel - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - break; - } - case Window::DimY: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case Window::DimZ: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case 3: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - - if(output->total_size() != 0) - { - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); - } - - return Status{}; + return 
opencl::ClConcatenate::validate(inputs_vector, output, axis); } void CLConcatenateLayer::run() { - for(auto &kernel : _concat_kernels) + ITensorPack pack; + for (unsigned i = 0; i < _impl->num_inputs; ++i) { - CLScheduler::get().enqueue(*kernel, true); + pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp new file mode 100644 index 0000000000..9d1b368f72 --- /dev/null +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLConv3D.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/gpu/cl/operators/ClDirectConv3d.h" + +namespace arm_compute +{ +using namespace arm_compute::experimental; + +struct CLConv3D::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv3d> op{nullptr}; +}; + +CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>()) +{ +} + +CLConv3D::~CLConv3D() = default; + +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + + _impl->src = src; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = dst; + + _impl->op = std::make_unique<opencl::ClDirectConv3d>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? 
_impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); +} + +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) +{ + return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index 68c0fb6ebf..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,25 +23,64 @@ */ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" + namespace arm_compute { -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +struct CLConvertFullyConnectedWeights::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr}; +}; +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) +{ +} +CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; + +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>(); - k->configure(compile_context, input, output, original_input_shape, data_layout); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClConvertFullyConnectedWeights>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const 
ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout); + return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } -} // namespace arm_compute
\ No newline at end of file + +void CLConvertFullyConnectedWeights::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp deleted file mode 100644 index 2b0d7d5e53..0000000000 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLConvolution.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>(); - k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} - -template <unsigned int matrix_size> -CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() -{ -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t 
constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(conv == nullptr); - std::array<int16_t, matrix_size> conv_col{ 0 }; - std::array<int16_t, matrix_size> conv_row{ 0 }; - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - if(_is_separable) - { - std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffer - _tmp.allocator()->allocate(); - } - else - { - _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); - } -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::run() -{ - CLScheduler::get().enqueue(_border_handler); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); - } - else - { - CLScheduler::get().enqueue(_kernel); - } -} - -template class arm_compute::CLConvolutionSquare<5>; -template class arm_compute::CLConvolutionSquare<7>; -template class arm_compute::CLConvolutionSquare<9>; - -void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value); -} - -void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, - BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>(); - k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index b6e1413f7a..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,113 +23,149 @@ */ #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" -#include <cmath> -#include <memory> -#include <tuple> +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClConv2d.h" +#include "support/Cast.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; - -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() +using namespace arm_compute::experimental; +struct CLConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +CLConvolutionLayer::~CLConvolutionLayer() = default; + +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - 
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math, num_groups));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+        weights_info, dilation, act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
+
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
-                                                      weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
+    switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+                                                     weights_info, CLScheduler::get().target()))
     {
         case ConvolutionMethod::WINOGRAD:
-        {
-            ARM_COMPUTE_ERROR_ON(num_groups != 1);
-            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
-            _function = std::move(f);
-            break;
-        }
         case ConvolutionMethod::DIRECT:
-        {
-            ARM_COMPUTE_ERROR_ON(num_groups != 1);
-            auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
-            _function = std::move(f);
-            break;
-        }
+        case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
-            _function = std::move(f);
+            auto f = std::make_unique<opencl::ClConv2d>();
+            f->configure(compile_context, input->info(), weights->info(),
+                         ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+            _impl->op = std::move(f);
             break;
         }
         case ConvolutionMethod::FFT:
         {
-            auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
-            _function = std::move(f);
+            auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager);
+            f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
+            _impl->func = std::move(f);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
     }
+
+    if (_impl->op)
+    {
+        _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+        _impl->aux_mem_req  = _impl->op->workspace();
+        _impl->run_pack     = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->prep_pack    = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    }
 }
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo         *input,
+                                    const ITensorInfo         *weights,
+                                    const ITensorInfo         *biases,
+                                    const ITensorInfo         *output,
+                                    const PadStrideInfo       &conv_info,
+                                    const WeightsInfo         &weights_info,
+                                    const Size2D              &dilation,
+                                    const ActivationLayerInfo &act_info,
+                                    bool                       enable_fast_math,
+                                    unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    const GPUTarget  gpu_target  = CLScheduler::get().target();
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
+    switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
     {
         case ConvolutionMethod::WINOGRAD:
-        {
-            //Validate Winograd
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
-            break;
-        }
         case ConvolutionMethod::DIRECT:
-        {
-            // Validate direct convolution layer
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
-            break;
-        }
+        case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            // Validate gemm-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
             break;
         }
         case ConvolutionMethod::FFT:
         {
             // Validate FFT-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+                                                                        act_info, enable_fast_math));
             break;
         }
         default:
@@ -140,88 +176,48 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
     return Status{};
 }
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo         *input,
+                                                             const ITensorInfo         *weights,
+                                                             const ITensorInfo         *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             const WeightsInfo         &weights_info,
+                                                             const ActivationLayerInfo &act_info,
+                                                             const GPUTarget            gpu_target,
+                                                             const Size2D              &dilation,
+                                                             bool                       enable_fast_math)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_UNUSED(weights_info);
-    ARM_COMPUTE_UNUSED(gpu_target);
-
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
-    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
-    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
-    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
-    const std::vector<ConfigurationMethod> known_configs =
-    {
-        // Alexnet
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
-        // VGG16 / VGG19
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
-        // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
-        // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
-        // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
-        // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
-    };
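The deleted table above, together with the find_config matcher that follows, formed the old model-specific selection heuristic; after this patch the decision is delegated per call to opencl::ClConv2d. A minimal sketch of querying the new path (the helper function is hypothetical, while Conv2dInfo, ClConv2d and CLScheduler are the APIs shown in this diff):

// Hypothetical helper, not library API: ask the operator-level heuristic
// which convolution method it would pick for the given tensor infos.
ConvolutionMethod query_method(const ITensorInfo *src, const ITensorInfo *wei, const ITensorInfo *dst,
                               const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
                               const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
    // Conv2dInfo now bundles dilation, activation and fast-math, which is why
    // get_convolution_method() lost those as separate parameters.
    const Conv2dInfo conv2d_info(conv_info, dilation, act_info, enable_fast_math, 1 /* num_groups */);
    return opencl::ClConv2d::get_convolution_method(src, wei, dst, conv2d_info, weights_info,
                                                    CLScheduler::get().target());
}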
- - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); - const DataLayout data_layout = std::get<4>(config); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); + return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); +} - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == input->data_layout()); - }; +void CLConvolutionLayer::run() +{ + prepare(); - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(dilation != Size2D(1U, 1U)) + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->run(); } else { - // SRGAN - if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + _impl->op->run(_impl->run_pack); } } -void CLConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - void CLConvolutionLayer::prepare() { - _function->prepare(); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 4c5d62a82c..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,31 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include <utility> -using namespace arm_compute; +namespace arm_compute +{ +struct CLCopy::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCopy> op{nullptr}; +}; + +CLCopy::CLCopy() : _impl(std::make_unique<Impl>()) +{ +} +CLCopy::CLCopy(CLCopy &&) = default; +CLCopy &CLCopy::operator=(CLCopy &&) = default; +CLCopy::~CLCopy() = default; -void CLCopy::configure(ICLTensor *input, ICLTensor *output) +void CLCopy::configure(ICLTensor *input, ICLTensor *output, Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), input, output); + configure(CLKernelLibrary::get().get_compile_context(), input, output, dst_window); +} + +void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, dst_window); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClCopy>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), dst_window); } -void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output) +Status CLCopy::validate(const ITensorInfo *input, const ITensorInfo *output, Window *dst_window) { - auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + return opencl::ClCopy::validate(input, output, dst_window); } -Status CLCopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +void CLCopy::run() { - return CLCopyKernel::validate(input, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp new file mode 100644 index 0000000000..fc29c43827 --- /dev/null +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLCrop.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" + +#include <utility> + +namespace arm_compute +{ +struct CLCrop::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCrop> op{nullptr}; +}; + +CLCrop::CLCrop() : _impl(std::make_unique<Impl>()) +{ +} +CLCrop::CLCrop(CLCrop &&) = default; +CLCrop &CLCrop::operator=(CLCrop &&) = default; +CLCrop::~CLCrop() = default; + +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); +} + +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); + + _impl->src = src; + _impl->dst = dst; + + _impl->op = std::make_unique<opencl::ClCrop>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); +} + +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); +} + +void CLCrop::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 17fc80e146..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,13 +26,25 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include <cstddef> namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -45,28 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, + static_cast<uint32_t>(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_kernels() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +CLCropResize::~CLCropResize() = default; + +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if 
(output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -76,19 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
     return Status{};
 }
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor    *input,
+                             ICLTensor          *boxes,
+                             ICLTensor          *box_ind,
+                             ICLTensor          *output,
+                             Coordinates2D       crop_size,
+                             InterpolationPolicy method,
+                             float               extrapolation_value)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+              extrapolation_value);
 }
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *boxes,
+                             ICLTensor              *box_ind,
+                             ICLTensor              *output,
+                             Coordinates2D           crop_size,
+                             InterpolationPolicy     method,
+                             float                   extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
-    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+                                                      crop_size, method, extrapolation_value));
+    ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
-    TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+    TensorShape output_shape =
+        TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
     auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
     _num_boxes = boxes->info()->tensor_shape()[1];
@@ -103,26 +150,26 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
     // For each crop box:
     // - The initial cropped image is produced as specified by boxes[i] from the 3D image input[box_ind[i]].
-    //   Possibly using a CLCropKernel and up to four CLMemsetKernels.
+    //   Possibly using a CLCrop and up to four CLFills.
     // - A tensor is required to hold this initial cropped image.
     // - A scale function is used to resize the cropped image to the size specified by crop_size.
     // - A tensor is required to hold the final scaled image before it is copied into the 4D output
-    //   that will hold all final cropped and scaled 3D images using CLCopyKernel.
+    //   that will hold all final cropped and scaled 3D images using CLCopy.
     // The contents of _boxes and _box_ind are required to calculate the shape
     // of the initial cropped image and thus are required to configure the
     // kernels used for cropping and scaling.
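In outline, each box is served by three of the function-level objects this patch switches to; a condensed sketch of the per-box wiring (crop_result, scaled_result, win and slice_in are illustrative placeholders, while the configure() signatures are those used in this file):

// Per-box pipeline (sketch): crop with out-of-bounds padding, resize to
// crop_size, then copy the result into slice num_box of the 4D output.
auto crop = std::make_unique<CLCrop>();
crop->configure(compile_context, input, crop_result, start_in, end_in, batch_index, extrapolation_value, &slice_in);

auto scale = std::make_unique<CLScale>();
scale->configure(compile_context, crop_result, scaled_result,
                 ScaleKernelInfo{method, BorderMode::CONSTANT, PixelValue(extrapolation_value),
                                 SamplingPolicy::TOP_LEFT});

auto copy = std::make_unique<CLCopy>();
copy->configure(compile_context, scaled_result, output, &win); // win restricts the copy to slice num_box

Since these are functions rather than kernels, the run() method further down can call run() on each stage directly instead of enqueuing kernels on the CLScheduler.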
_boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { - auto crop_tensor = support::cpp14::make_unique<CLTensor>(); + auto crop_tensor = std::make_unique<CLTensor>(); TensorInfo crop_result_info(1, DataType::F32); crop_result_info.set_data_layout(DataLayout::NHWC); crop_tensor->allocator()->init(crop_result_info); _crop_results.emplace_back(std::move(crop_tensor)); - auto scale_tensor = support::cpp14::make_unique<CLTensor>(); + auto scale_tensor = std::make_unique<CLTensor>(); TensorInfo scaled_result_info(out_shape, 1, DataType::F32); scaled_result_info.set_data_layout(DataLayout::NHWC); scale_tensor->allocator()->init(scaled_result_info); @@ -134,15 +181,17 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT Coordinates end{}; configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); - auto scale_kernel = support::cpp14::make_unique<CLScale>(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); + auto scale_kernel = std::make_unique<CLScale>(); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); win.set(3, Window::Dimension(num_box, num_box + 1, 1)); - auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>(); - copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win); + auto copy_kernel = std::make_unique<CLCopy>(); + copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, &win); _copy.emplace_back(std::move(copy_kernel)); _crop_results[num_box]->allocator()->allocate(); @@ -151,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> rows_out_of_bounds{ 0 }; + std::array<int32_t, 2> rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array<int32_t, 2> cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? 
std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -195,64 +266,86 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); - _internal_kernels.push_back(std::move(kernel)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); - _internal_kernels.push_back(std::move(kernel)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); - _internal_kernels.push_back(std::move(kernel)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. 
- int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; - auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>(); - - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); - _internal_kernels.push_back(std::move(kernel)); + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; + auto kernel = std::make_unique<CLCrop>(); + + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); - _internal_kernels.push_back(std::move(kernel)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } } _boxes->unmap(CLScheduler::get().queue()); @@ -264,21 +357,21 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_kernels.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { - CLScheduler::get().enqueue(*(_internal_kernels[i])); + _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { - CLScheduler::get().enqueue(*kernel, true); + kernel->run(); } CLScheduler::get().sync(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 62e7d9a582..4e0d1501ba 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,18 @@ */ #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/operators/ClTransposedConvolution.h" + #include <cmath> #include <memory> #include <tuple> @@ -36,34 +42,70 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; +struct CLDeconvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; +}; + +CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; + CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function(), _impl(std::make_unique<Impl>()) { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>(); + auto op = std::make_unique<opencl::ClTransposedConvolution>(); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = bias; + _impl->dst = output; + + _impl->op = std::move(op); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { + auto f = std::make_unique<CLDirectDeconvolutionLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info); _function = std::move(f); break; } case DeconvolutionMethod::GEMM: { - auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; @@ -74,16 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { + // Validate transposed convolution operator + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -100,19 +154,40 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); + if (is_data_type_quantized_per_channel(weights->data_type())) + { + return DeconvolutionMethod::UPSCALE_CONV2D; + } + const DataLayout data_layout = input->data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != 
deconv_info.stride().second) { - return DeconvolutionMethod::DIRECT; + // We observe better performance for FP32 types only when ofm <= 16, and for FP16 only when ofm <= 32. + if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)) && + !((input->data_type() == DataType::F16) && (ofm > 32))) + { + return DeconvolutionMethod::DIRECT; + } + else + { + return DeconvolutionMethod::UPSCALE_CONV2D; + } } return DeconvolutionMethod::GEMM; @@ -121,10 +196,29 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor void CLDeconvolutionLayer::run() { prepare(); - _function->run(); + + if (_impl->op != nullptr) + { + // Optimized Operator will be used + ITensorPack pack; + + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); + } + else + { + _function->run(); + } } void CLDeconvolutionLayer::prepare() { - _function->prepare(); + if (_impl->op == nullptr) + { + _function->prepare(); + } } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index be2d120dcd..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,16 +28,20 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" + namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(), - _memset(), - _output(nullptr) + : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr) { } -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; + +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -47,18 +51,23 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(compile_context, input, _output, info); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample->configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() { - CLScheduler::get().enqueue(_memset, false); - 
CLScheduler::get().enqueue(_upsample, true); + _fill.run(); + CLScheduler::get().enqueue(*_upsample, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index b848f989e6..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,27 +23,66 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> namespace arm_compute { +struct CLDepthConvertLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; +}; + +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::~CLDepthConvertLayer() = default; + void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) { configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) +{ + ARM_COMPUTE_UNUSED(shift); + ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); + + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_ERROR_ON(shift != 0); + + _impl->op = std::make_unique<opencl::ClCast>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); +} + +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>(); - k->configure(compile_context, input, output, policy, shift); - _kernel = std::move(k); + ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); + return opencl::ClCast::validate(input, output, policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::run() { - return CLDepthConvertLayerKernel::validate(input, output, policy, shift); + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 89e5faa4d5..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include <utility> @@ -35,9 +35,13 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + auto k = std::make_unique<CLDepthToSpaceLayerKernel>(); k->configure(compile_context, input, output, block_shape); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index b1e9fe77d4..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,102 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_dwc; -namespace -{ -Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation) -{ - // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - const bool needs_permute = is_nhwc && (depth_multiplier > 1); - const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && 
is_quantized; - const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); - const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - DepthwiseConvolutionReshapeInfo info; - info.c0 = 4; - info.transpose = is_stride_1_dilation_1 && is_dot8_supported; - - TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) - { - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - - const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); - output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c))); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - } - - if(needs_permute) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - - permute(permuted_input_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_output_shape, PermutationVector(1U, 2U, 0U)); - - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW); - - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - conv_info, depth_multiplier, act_info, gpu_target, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - else if(is_nhwc) - { - if(needs_weights_reshape) - { - auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, - output, conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - return Status{}; -} -} // namespace - -CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) 
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _dwc_native_kernel(),
+      _dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
       _permute_input_to_nhwc(),
       _permute_weights_to_nhwc(),
       _permute_output_to_nchw(),
@@ -137,25 +61,36 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv
 {
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
+
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input,
+                                            const ICLTensor *weights,
+                                            const ICLTensor *biases,
+                                            ICLTensor *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier,
+                                            ActivationLayerInfo act_info,
+                                            const Size2D &dilation)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+              act_info, dilation);
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                                                                ICLTensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                            ICLTensor *input,
+                                            const ICLTensor *weights,
+                                            const ICLTensor *biases,
+                                            ICLTensor *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier,
+                                            ActivationLayerInfo act_info,
+                                            const Size2D &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
-                                                                     weights->info(),
-                                                                     biases != nullptr ? biases->info() : nullptr,
-                                                                     output->info(),
-                                                                     conv_info,
-                                                                     depth_multiplier,
-                                                                     act_info,
-                                                                     dilation));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+        output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 
     _is_quantized = is_data_type_quantized(input->info()->data_type());
     _is_prepared  = false;
@@ -164,10 +99,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
     _output            = output;
     _needs_permute     = input->info()->data_layout() == DataLayout::NCHW;
 
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         _memory_group.manage(&_permuted_output);
@@ -190,10 +127,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
     CLTensor *output_multipliers_to_use = nullptr;
     CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
+    if (_is_quantized)
     {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+        const size_t idx_c =
+            get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters =
+            (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
 
         _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
         _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -202,15 +141,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
         output_shifts_to_use      = &_output_shifts;
     }
 
-    DWCWeightsKernelInfo dwc_weights_info;
-    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
-    DWCKernelInfo dwc_info;
-    dwc_info.activation_info = act_info;
-    _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
-                                 dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
-                                 output_multipliers_to_use, output_shifts_to_use);
+    // Get the depthwise convolution compute parameters
+    auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info =
+        t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
-    if(_needs_permute)
+    _dwc_native_kernel->set_target(gpu_target);
+    _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
+                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+                                  output_shifts_to_use);
+
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
@@ -220,37 +163,51 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
         _permuted_output.allocator()->allocate();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
    {
         _output_multipliers.allocator()->allocate();
         _output_shifts.allocator()->allocate();
     }
 }
 
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                 const PadStrideInfo &conv_info,
-                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+                                             const ITensorInfo *weights,
+                                             const ITensorInfo *biases,
+                                             const ITensorInfo *output,
+                                             const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier,
+                                             ActivationLayerInfo act_info,
+                                             const Size2D &dilation)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+
+    const bool in_place = input == output || output == nullptr;
+    if (in_place)
+    {
+        output = input;
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+                                input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+                                input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
-    DWCWeightsKernelInfo dwc_weights_info;
-    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
-    DWCKernelInfo dwc_info;
-    dwc_info.activation_info = act_info;
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     const bool needs_permute = input->data_layout() == DataLayout::NCHW;
     const bool is_quantized  = is_data_type_quantized(input->data_type());
 
     TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
-    if(is_quantized)
+    if (is_quantized)
     {
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
@@ -263,72 +220,95 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
         }
     }
 
-    if(needs_permute)
+    if (needs_permute)
     {
-        TensorShape permuted_input_shape   = input->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+        TensorShape permuted_output_shape =
+            shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
 
         permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_input   = input->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_input_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_weights = weights->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_weights_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_output  = output->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_output_shape)
+                                                .set_data_layout(DataLayout::NHWC);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info,
-                                                                                      dwc_info, conv_info, depth_multiplier, dilation,
-                                                                                      &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+
+        // Get the depthwise convolution compute parameters
+        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+            &output_multipliers_shifts_info, &output_multipliers_shifts_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier,
-                                                                                      dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        // Get the depthwise convolution compute parameters
+        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(input, weights, conv_info, dilation, depth_multiplier);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+            &output_multipliers_shifts_info));
     }
     return Status{};
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run()
+void CLDepthwiseConvolutionLayer::run()
 {
     prepare();
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_to_nhwc.run();
     }
-    CLScheduler::get().enqueue(_dwc_native_kernel);
-    if(_needs_permute)
+    CLScheduler::get().enqueue(*_dwc_native_kernel);
+    if (_needs_permute)
    {
         _permute_output_to_nchw.run();
     }
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
+void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output->info(),
-                                                                   idx_ofms,
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            quantization::compute_quantized_multipliers_and_shifts(
+                _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
             _output_shifts.unmap();
         }
 
-        if(_needs_permute)
+        if (_needs_permute)
        {
             ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -339,315 +319,4 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
         _is_prepared = true;
     }
 }
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _kernel(nullptr),
-      _border_handler(),
-      _permute_input_to_nchw(),
-      _permute_weights_to_nchw(),
-      _permute_output_to_nhwc(),
-      _reshape_weights(),
-      _permuted_input(),
-      _permuted_weights(),
-      _permuted_output(),
-      _output_multipliers(),
-      _output_shifts(),
-      _original_weights(nullptr),
-      _input(nullptr),
-      _output(nullptr),
-      _needs_permute(false),
-      _needs_weights_reshape(false),
-      _is_prepared(false),
-      _is_quantized(false)
-{
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                                                    const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                                                                    ICLTensor *output,
-                                                                                    const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(),
-                                                                                weights->info(),
-                                                                                biases != nullptr ? biases->info() : nullptr,
-                                                                                output->info(),
-                                                                                conv_info,
-                                                                                depth_multiplier,
-                                                                                act_info,
-                                                                                gpu_target,
-                                                                                dilation));
-
-    const bool is_nhwc     = input->info()->data_layout() == DataLayout::NHWC;
-    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _needs_permute         = is_nhwc && (depth_multiplier > 1);
-    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized;
-
-    _is_prepared      = false;
-    _original_weights = weights;
-    _input            = input;
-    _output           = output;
-
-    ICLTensor       *input_to_use   = input;
-    const ICLTensor *weights_to_use = weights;
-    ICLTensor       *output_to_use  = output;
-
-    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-    const bool is_stride_1              = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-    const bool is_stride_1_dilation_1   = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-
-    DepthwiseConvolutionReshapeInfo info;
-    info.c0        = 4;
-    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
-
-    if(_needs_permute)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        input_to_use   = &_permuted_input;
-        weights_to_use = &_permuted_weights;
-        output_to_use  = &_permuted_output;
-
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
-    }
-    else if(is_nhwc)
-    {
-        if(_needs_weights_reshape)
-        {
-            _reshape_weights.configure(compile_context, weights, &_permuted_weights, info);
-            weights_to_use = &_permuted_weights;
-        }
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
-    }
-    else
-    {
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
-    }
-
-    CLTensor *output_multipliers_to_use = nullptr;
-    CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
-    {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
-
-        _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-        _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
-        output_multipliers_to_use = &_output_multipliers;
-        output_shifts_to_use      = &_output_shifts;
-    }
-
-    // Configure kernel
-    _kernel->set_target(gpu_target);
-    _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
-                       act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
-
-    if(_is_quantized)
-    {
-        _output_multipliers.allocator()->allocate();
-        _output_shifts.allocator()->allocate();
-    }
-
-    // Permute output if needed
-    if(_needs_permute)
-    {
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
-        _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-    // Configure border handler
-    PixelValue &&zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
-    {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
-    }
-    _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
-}
-
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
-{
-    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_needs_permute)
-    {
-        _permute_input_to_nchw.run();
-    }
-    CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(*_kernel);
-
-    if(_needs_permute)
-    {
-        _permute_output_to_nhwc.run();
-    }
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
-{
-    if(!_is_prepared)
-    {
-        if(_is_quantized)
-        {
-            _output_multipliers.map();
-            _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output->info(),
-                                                                   idx_ofms,
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
-            _output_multipliers.unmap();
-            _output_shifts.unmap();
-        }
-
-        if(_needs_permute)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-            _permuted_weights.allocator()->allocate();
-            _permute_weights_to_nchw.run();
-            _original_weights->mark_as_unused();
-        }
-
-        if(_needs_weights_reshape)
-        {
-            ARM_COMPUTE_ERROR_ON(_needs_permute);
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-            _permuted_weights.allocator()->allocate();
-            CLScheduler::get().enqueue(_reshape_weights);
-            _original_weights->mark_as_unused();
-        }
-        _is_prepared = true;
-    }
-}
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic()
-{
-}
-
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                            ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                            const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier,
-                                            ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget gpu_target = CLScheduler::get().target();
-    _depth_conv_func           = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
-                                                                   dilation, gpu_target);
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.set_memory_group(_memory_manager);
-            _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-        {
-            _func_generic.set_memory_group(_memory_manager);
-            _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget              gpu_target      = CLScheduler::get().target();
-    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
-    switch(depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
-        case DepthwiseConvolutionFunction::GENERIC:
-            return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                            const PadStrideInfo &conv_info,
-                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, GPUTarget gpu_target)
-{
-    if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)) && (is_data_type_float(input->data_type())
-                                                                                                                                                               || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD))
-    {
-        return DepthwiseConvolutionFunction::OPTIMIZED;
-    }
-    else
-    {
-        return DepthwiseConvolutionFunction::GENERIC;
-    }
-}
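The hunks above fold the old Generic/Internal3x3 dispatch into a single native-kernel path, pick tile parameters through ClDWCNativeKernelConfigurationFactory, and let validate() accept a null (or aliased) output for in-place NHWC execution. A minimal driver for the refactored function is sketched below; the shapes, the F32 data type and the pad/stride values are illustrative assumptions, not part of the diff.

    // Hedged usage sketch for the refactored CLDepthwiseConvolutionLayer.
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void run_depthwise_example()
    {
        CLScheduler::get().default_init();

        // NHWC tensors: dimension 0 is the channel, so a 28x28 image with
        // 32 channels is (C, W, H) = (32, 28, 28).
        CLTensor   src, weights, dst;
        TensorInfo src_info(TensorShape(32U, 28U, 28U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);

        TensorInfo w_info(TensorShape(32U, 3U, 3U), 1, DataType::F32); // 3x3 depthwise weights
        w_info.set_data_layout(DataLayout::NHWC);
        weights.allocator()->init(w_info);

        CLDepthwiseConvolutionLayer dwc;
        // Stride 1, pad 1 (SAME for 3x3), depth_multiplier 1, no fused activation.
        // dst's info is assumed to be auto-initialised here, as is usual for
        // ACL functions when the output info is empty.
        dwc.configure(&src, &weights, nullptr /* biases */, &dst, PadStrideInfo(1, 1, 1, 1),
                      1, ActivationLayerInfo(), Size2D(1, 1));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src and weights, then:
        dwc.run(); // run() calls prepare() internally on first use
    }

Per the validate() logic above, passing the input again (or a null output) selects the in-place path, which is accepted only for NHWC data.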
-
-void CLDepthwiseConvolutionLayer::run()
-{
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.run();
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.run();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
-    }
-}
-
-void CLDepthwiseConvolutionLayer::prepare()
-{
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.prepare();
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.prepare();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
-    }
-}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 362b36cc95..20162a03db 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,25 +23,55 @@
 */
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
 
 namespace arm_compute
 {
+struct CLDequantizationLayer::Impl
+{
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClDequantize> op{nullptr};
+};
+
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
+{
+}
+CLDequantizationLayer::~CLDequantizationLayer() = default;
+
 void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
 }
 
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor *input,
+                                      ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
+    ARM_COMPUTE_LOG_PARAMS(input, output);
+    _impl->src = input;
+    _impl->dst = output;
+
+    _impl->op = std::make_unique<opencl::ClDequantize>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 
 Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLDequantizationLayerKernel::validate(input, output);
+    return opencl::ClDequantize::validate(input, output);
+}
+
+void CLDequantizationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
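CLDequantizationLayer is the simplest instance of the porting pattern this changeset applies file after file: the public function keeps its legacy signature, owns a pimpl struct, configures a src/gpu/cl operator on ITensorInfo only, and binds the actual tensors at run() time through an ITensorPack. A distilled sketch of the pattern follows; MyFunction and opencl::ClOp are placeholder names standing in for the per-function operator types (ClDequantize, ClDirectConv2d, ClAdd, ...), not library API.

    // Distilled wrapper pattern (placeholder names, not library API).
    struct MyFunction::Impl
    {
        const ICLTensor              *src{nullptr};
        ICLTensor                    *dst{nullptr};
        std::unique_ptr<opencl::ClOp> op{nullptr};
    };

    void MyFunction::configure(const CLCompileContext &ctx, const ICLTensor *input, ICLTensor *output)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<opencl::ClOp>();
        // Operators see only metadata here; no device buffers are captured yet.
        _impl->op->configure(ctx, input->info(), output->info());
    }

    void MyFunction::run()
    {
        // Tensors are bound late, per call, so one configured operator can be
        // reused with different backing memory.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, _impl->src);
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }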
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
deleted file mode 100644
index 68d3752463..0000000000
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
deleted file mode 100644
index 05351a9de3..0000000000
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
-
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 6e9782f77a..d6dae0d732 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,76 +24,81 @@
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-using namespace arm_compute;
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+struct CLDirectConvolutionLayer::Impl
+{
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
+};
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input,
+                                         const ICLTensor *weights,
+                                         const ICLTensor *biases,
+                                         ICLTensor *output,
+                                         const PadStrideInfo &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info,
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                         ICLTensor *input,
+                                         const ICLTensor *weights,
+                                         const ICLTensor *biases,
+                                         ICLTensor *output,
+                                         const PadStrideInfo &conv_info,
                                          const ActivationLayerInfo &act_info)
 {
-    // Set GPU target
-    _direct_conv_kernel.set_target(CLScheduler::get().target());
-
-    // Configure direct convolution
-    _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
 
-    // Configure border handler
-    PixelValue &&zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
-    {
-        zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-    }
-    _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->biases  = biases;
+    _impl->dst     = output;
 
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
-
-    _is_activationlayer_enabled = act_info.enabled();
-
-    //Configure Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(compile_context, output, nullptr, act_info);
-    }
+    _impl->op = std::make_unique<opencl::ClDirectConv2d>();
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input,
+                                          const ITensorInfo *weights,
+                                          const ITensorInfo *biases,
+                                          const ITensorInfo *output,
+                                          const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
-    }
-    return Status{};
+    return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
 }
 
 void CLDirectConvolutionLayer::run()
 {
-    // Run border handler
-    CLScheduler::get().enqueue(_input_border_handler, false);
-
-    // Run direct convolution
-    CLScheduler::get().enqueue(_direct_conv_kernel);
-
-    //Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
+} // namespace arm_compute
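With border handling and the optional activation now fused inside opencl::ClDirectConv2d, the function body reduces to the same pack dispatch, with weights and biases in the ACL_SRC_1/ACL_SRC_2 slots. Callers can still probe support up front through the static validate() that configure() invokes internally; a hedged sketch, assuming src, weights and dst are CLTensors with initialised infos and using illustrative conv parameters:

    const PadStrideInfo conv_info(1, 1, 0, 0); // stride 1, no padding (illustrative)
    Status s = CLDirectConvolutionLayer::validate(src.info(), weights.info(), nullptr,
                                                  dst.info(), conv_info, ActivationLayerInfo());
    if (bool(s)) // Status converts to bool: true means the configuration is supported
    {
        CLDirectConvolutionLayer conv;
        conv.configure(&src, &weights, nullptr, &dst, conv_info, ActivationLayerInfo());
        conv.run(); // the activation, when enabled, runs fused inside the operator
    }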
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index da16bed3e0..7cd268ab0b 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,12 +23,18 @@
 */
 #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include <memory>
 #include <tuple>
 
@@ -49,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
 {
 }
 
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                                            const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input,
+                                            const ITensorInfo *weights,
+                                            const ITensorInfo *bias,
+                                            ITensorInfo *output,
+                                            const PadStrideInfo &info,
+                                            const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
 
     const DataLayout data_layout = input->data_layout();
@@ -61,18 +72,25 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                    weights->dimension(idx_w), weights->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    if(bias != nullptr)
+    if (input->data_type() != weights->data_type())
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+                                    !is_data_type_quantized_asymmetric(input->data_type()));
+    }
+
+    if (bias != nullptr)
+    {
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -91,26 +109,42 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
     unsigned int       deconv_pad_y = 0;
     const unsigned int stride_x     = info.stride().first;
     const unsigned int stride_y     = info.stride().second;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
-    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+                                                                              out_dims, deconv_pad_x, deconv_pad_y);
+    TensorInfo scale_out_info(input->clone()
+                                  ->set_is_resizable(true)
+                                  .reset_padding()
+                                  .set_tensor_shape(scale_out_shape)
+                                  .set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
 }
 
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input,
+                                           ICLTensor *weights,
+                                           const ICLTensor *bias,
+                                           ICLTensor *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
 }
 
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor *input,
+                                           ICLTensor *weights,
+                                           const ICLTensor *bias,
+                                           ICLTensor *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
 
     const unsigned int pad_left  = info.pad_left();
     const unsigned int pad_right = info.pad_right();
@@ -127,17 +161,21 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _original_weights = weights;
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-    _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+    _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, /* use_inverted_axis */ false);
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+    auto out_dims =
+        deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+                                        weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+    auto_init_if_empty(*output->info(),
                       input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
 
     _is_prepared = weights_info.retain_internal_weights();
 
@@ -146,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int deconv_pad_x = 0;
     unsigned int deconv_pad_y = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+        *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
 
     unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
     unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -167,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _scaled_output.allocator()->init(scale_out_info);
 
     // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+                                      deconv_pad_bottom, DimensionRoundingType::FLOOR);
     _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
@@ -179,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _flip_axis.allocator()->allocate();
     _flip_axis.map(true);
     auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-    if(weights->info()->data_layout() == DataLayout::NHWC)
+    if (weights->info()->data_layout() == DataLayout::NHWC)
     {
         axis_data[0] = 1;
         axis_data[1] = 2;
@@ -204,7 +244,7 @@ void CLDirectDeconvolutionLayer::run()
 
 void CLDirectDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -217,11 +257,10 @@ void CLDirectDeconvolutionLayer::prepare()
         _conv_f.prepare();
 
         // Free flipped weights
-        if(!_weights_flipped.is_used())
+        if (!_weights_flipped.is_used())
        {
             _weights_flipped.allocator()->free();
         }
-
         _is_prepared = true;
     }
 }
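The decomposition this file preserves (and the diff only reflows) is easiest to see with a worked example; the numbers below are illustrative, not from the source. For a 4x4 input, a 3x3 kernel, stride 2 and no padding, the usual transposed-convolution formula gives out = (in - 1) * stride + kernel = (4 - 1) * 2 + 3 = 9. deconvolution_output_dimensions() computes those target dimensions, compute_deconvolution_upsampled_shape() derives the zero-inserted intermediate together with the deconv_pad_* corrections, and the 9x9 result is then produced by CLConvolutionLayer at stride 1 over the upsampled tensor, using the weights flipped via the CLReverse-style _flip_weights function configured above.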
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
deleted file mode 100644
index ce615327a9..0000000000
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
-
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::RSQRT);
-    _kernel = std::move(k);
-}
-Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
-}
-
-void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::EXP);
-    _kernel = std::move(k);
-}
-Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
-}
-
-void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::NEG);
-    _kernel = std::move(k);
-}
-Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
-}
-
-void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::SIN);
-    _kernel = std::move(k);
-}
-Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
-}
-
-void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ABS);
-    _kernel = std::move(k);
-}
-Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
-}
-void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::LOG);
-    _kernel = std::move(k);
-}
-Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
-}
-
-void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ROUND);
-    _kernel = std::move(k);
-}
-Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
-}
-
-} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 20e9545b61..d9529f0b7f 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,155 +23,395 @@
 */
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/Types.h"
 
-#include <utility>
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+#include "src/gpu/cl/operators/ClSub.h"
 
 namespace arm_compute
 {
-namespace
+struct CLArithmeticAddition::Impl
 {
-void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
-{
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAdd> op{nullptr};
+};
 
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
-        }
-    }
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
+{
 }
-} // namespace
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::~CLArithmeticAddition() = default;
 
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+    ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor *input1,
+                                     const ICLTensor *input2,
+                                     ICLTensor *output,
+                                     ConvertPolicy policy,
+                                     const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClAdd>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1,
+                                      const ITensorInfo *input2,
+                                      const ITensorInfo *output,
+                                      ConvertPolicy policy,
+                                      const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
 }
 
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::run()
 {
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
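Note what disappears along with the kernel classes: the anonymous-namespace configure_border_handler() helper that patched up x-dimension broadcasts with a REPLICATE border. The ClAdd/ClSub operator path absorbs that case, so broadcast arithmetic is driven just like the plain elementwise case. A short usage sketch, with shapes and data type as illustrative assumptions:

    // Adding a per-row vector to a 2D tensor via x-dimension broadcast.
    CLTensor a, b, sum;
    a.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(1U, 8U), 1, DataType::F32)); // dim 0 == 1 -> broadcast
    sum.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE, ActivationLayerInfo());
    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    // ... fill a and b, then:
    add.run();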
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+struct CLArithmeticSubtraction::Impl
+{
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSub> op{nullptr};
+};
+
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
+{
+}
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::~CLArithmeticSubtraction() = default;
+
+void CLArithmeticSubtraction::configure(const ICLTensor *input1,
+                                        const ICLTensor *input2,
+                                        ICLTensor *output,
+                                        ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
+                                        const ICLTensor *input1,
+                                        const ICLTensor *input2,
+                                        ICLTensor *output,
+                                        ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClSub>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1,
+                                         const ITensorInfo *input2,
+                                         const ITensorInfo *output,
+                                         ConvertPolicy policy,
+                                         const ActivationLayerInfo &act_info)
+{
+    return opencl::ClSub::validate(input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticSubtraction::run()
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
 
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+struct CLArithmeticDivision::Impl
 {
-    ARM_COMPUTE_UNUSED(policy);
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
+    const ICLTensor                              *src_0{nullptr};
+    const ICLTensor                              *src_1{nullptr};
+    ICLTensor                                    *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
+};
+
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
+{
 }
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::~CLArithmeticDivision() = default;
 
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor *input1,
+                                     ICLTensor *input2,
+                                     ICLTensor *output,
+                                     const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor *input1,
+                                     const ICLTensor *input2,
+                                     ICLTensor *output,
+                                     const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClElementwiseDivision>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1,
+                                      const ITensorInfo *input2,
+                                      const ITensorInfo *output,
+                                      const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
+    return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwiseMax::Impl
+{
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
+};
+
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
+{
+}
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
+CLElementwiseMax::~CLElementwiseMax() = default;
+
+void CLElementwiseMax::configure(ICLTensor *input1,
+                                 ICLTensor *input2,
+                                 ICLTensor *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext &compile_context,
+                                 ICLTensor *input1,
+                                 ICLTensor *input2,
+                                 ICLTensor *output,
+                                 const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClElementwiseMax>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
+    return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
 }
 
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::run()
+{
+    ITensorPack
pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +struct CLElementwiseMin::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMin> op{nullptr}; +}; + +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>()) +{ +} +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; +CLElementwiseMin::~CLElementwiseMin() = default; + +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwiseMin>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info); + return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +struct CLElementwiseSquaredDiff::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr}; +}; + +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>()) +{ +} +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; + +void CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext 
&compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwiseSquaredDiff>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); +} + +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) +{ + return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); +} + +void CLElementwiseSquaredDiff::run() { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +struct CLElementwisePower::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwisePower> op{nullptr}; +}; + +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>()) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); } +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; +CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwisePower>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const 
ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info); + return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } +void CLElementwisePower::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp new file mode 100644 index 0000000000..3043c26feb --- /dev/null +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
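Note on the pattern introduced above: the function-level API of CLArithmeticAddition and its siblings is unchanged; each class now merely owns an opencl operator behind a pimpl and forwards its tensors through an ITensorPack at run() time. A minimal caller-side sketch (assuming an initialised CLScheduler; shapes and data type are illustrative only):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, sum;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        sum.allocator()->init(info);

        // configure() records the tensors and builds the opencl::ClAdd operator.
        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();

        // run() packs ACL_SRC_0/ACL_SRC_1/ACL_DST into an ITensorPack and
        // dispatches the operator; no border handler is involved any more.
        add.run();
        return 0;
    }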
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000000..3043c26feb
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
+
+namespace arm_compute
+{
+struct CLRsqrtLayer::Impl
+{
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRsqrt> op{nullptr};
+};
+
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::~CLRsqrtLayer() = default;
+
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClRsqrt>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClRsqrt::validate(input, output);
+}
+
+void CLRsqrtLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLExpLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClExp> op{nullptr};
+};
+
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
+CLExpLayer::~CLExpLayer() = default;
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClExp>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClExp::validate(input, output);
+}
+
+void CLExpLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLNegLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClNeg> op{nullptr};
+};
+
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
+CLNegLayer::~CLNegLayer() = default;
+
+void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClNeg>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClNeg::validate(input, output);
+}
+
+void CLNegLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLSinLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSin> op{nullptr};
+};
+
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
+CLSinLayer::~CLSinLayer() = default;
+
+void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClSin>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClSin::validate(input, output);
+}
+
+void CLSinLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLAbsLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAbs> op{nullptr};
+};
+
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
+CLAbsLayer::~CLAbsLayer() = default;
+
+void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClAbs>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClAbs::validate(input, output);
+}
+
+void CLAbsLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLLogLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClLog> op{nullptr};
+};
+
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
+CLLogLayer::~CLLogLayer() = default;
+
+void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClLog>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClLog::validate(input, output);
+}
+
+void CLLogLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLRoundLayer::Impl
+{
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRound> op{nullptr};
+};
+
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
+CLRoundLayer::~CLRoundLayer() = default;
+
+void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClRound>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClRound::validate(input, output);
+}
+
+void CLRoundLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+} // namespace arm_compute
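Every unary layer in this new file repeats the same src/dst/op pimpl shape; only the operator type differs (ClRsqrt, ClExp, ClNeg, ClSin, ClAbs, ClLog, ClRound). A hypothetical factoring of the repetition, purely to illustrate the shared structure (this helper is not part of the patch):

    // Hypothetical sketch: what each CL*Layer::Impl and run() boil down to.
    template <typename OpType>
    struct UnaryLayerImpl
    {
        const ICLTensor        *src{nullptr};
        ICLTensor              *dst{nullptr};
        std::unique_ptr<OpType> op{nullptr};

        void run()
        {
            // Identical two-tensor pack in all seven layers above.
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC, src);
            pack.add_tensor(TensorType::ACL_DST, dst);
            op->run(pack);
        }
    };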
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
deleted file mode 100644
index e1bd7e6f2a..0000000000
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <numeric>
-
-using namespace arm_compute;
-
-namespace
-{
-void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
-{
-    dist.map(true);
-    cum_dist.map(true);
-    lut.map(true);
-
-    const uint32_t *dist_ptr     = dist.buffer();
-    uint32_t       *cum_dist_ptr = cum_dist.buffer();
-    uint8_t        *lut_ptr      = lut.buffer();
-
-    ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
-
-    // Calculate cumulative distribution
-    std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
-
-    // Get the number of pixels that have the lowest value in the input image
-    const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
-    {
-        return v > 0;
-    });
-    const size_t image_size = cum_dist_ptr[255];
-
-    if(image_size == num_lowest_pixels)
-    {
-        std::iota(lut_ptr, lut_ptr + 256, 0);
-    }
-    else
-    {
-        const float diff = image_size - num_lowest_pixels;
-
-        for(size_t i = 0; i < 256; ++i)
-        {
-            lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
-        }
-    }
-
-    dist.unmap();
-    cum_dist.unmap();
-    lut.unmap();
-}
-} // namespace
-
-CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
-{
-    _histogram_kernel.configure(compile_context, input, &_hist);
-    _border_histogram_kernel.configure(compile_context, input, &_hist);
-    _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output);
-}
-
-void CLEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    CLScheduler::get().enqueue(_histogram_kernel, false);
-
-    // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(_border_histogram_kernel, false);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
-
-    // Map input to output using created LUT.
-    CLScheduler::get().enqueue(_map_histogram_kernel);
-}
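For reference, the deleted calculate_cum_dist_and_lut() implemented the standard histogram-equalisation mapping: with $\mathrm{cdf}$ the cumulative histogram, $\mathrm{cdf}_{\min}$ the count of the lowest-occurring intensity (the first non-zero bin) and $N$ the number of pixels, it computed $\mathrm{lut}(i) = \mathrm{round}\!\left(255 \cdot \frac{\mathrm{cdf}(i) - \mathrm{cdf}_{\min}}{N - \mathrm{cdf}_{\min}}\right)$, falling back to the identity LUT (std::iota) when every pixel shares a single value and the denominator would be zero.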
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLErode.h"
-
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index c3922f5e66..48e9ae824a 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,25 +25,43 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
 CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+    : _memory_group(std::move(memory_manager)),
+      _digit_reverse_kernel(std::make_unique<CLFFTDigitReverseKernel>()),
+      _fft_kernels(),
+      _scale_kernel(std::make_unique<CLFFTScaleKernel>()),
+      _digit_reversed_input(),
+      _digit_reverse_indices(),
+      _num_ffts(0),
+      _run_scale(false)
 {
 }
+CLFFT1D::~CLFFT1D() = default;
+
 void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor *input,
+                        ICLTensor *output,
+                        const FFT1DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);
     // Decompose size to radix factors
     const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -62,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+                                     digit_reverse_config);
     // Create and configure FFT kernels
     unsigned int Nx = 1;
     _num_ffts       = decomposed_vector.size();
-    _fft_kernels.resize(_num_ffts);
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    _fft_kernels.reserve(_num_ffts);
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -77,18 +96,21 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
-        _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
+        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+                                       ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
         Nx *= radix_for_stage;
     }
     // Configure scale kernel
-    if(_run_scale)
+    if (_run_scale)
     {
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+               : _scale_kernel->configure(output, nullptr, scale_config);
     }
     // Allocate tensors
@@ -105,9 +127,9 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
 Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     // Check if FFT is decomposable
     const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -116,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
     ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -132,18 +154,18 @@ void CLFFT1D::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
     // Run digit reverse
-    CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+    CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
     // Run radix kernels
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
-        CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+        CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
     }
     // Run output scaling
-    if(_run_scale)
+    if (_run_scale)
    {
-        CLScheduler::get().enqueue(_scale_kernel, true);
+        CLScheduler::get().enqueue(*_scale_kernel, true);
     }
 }
 } // namespace arm_compute
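The kernel chain built above follows directly from the radix decomposition: helpers::fft::decompose_stages() factorises the FFT length into the radices the kernels support, one CLFFTRadixStageKernel is configured per factor, and Nx accumulates the product of the radices already processed. A sketch of the idea (decompose_stages lives in an internal header; the factorisation {3, 5, 8} for N = 120 is only an assumed example, the concrete result is an implementation detail):

    // N = 120; supported radices are typically a small set such as {2, 3, 4, 5, 7, 8}.
    const auto radices = CLFFTRadixStageKernel::supported_radix();
    const auto stages  = arm_compute::helpers::fft::decompose_stages(120, radices);
    // e.g. stages == {3, 5, 8}: three radix stages, with Nx = 1, then 3, then 15.

    unsigned int Nx = 1;
    for (unsigned int radix : stages)
    {
        // One CLFFTRadixStageKernel per factor; is_first_stage == (Nx == 1).
        Nx *= radix;
    }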
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 2482ea901a..3857046719 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,22 +27,36 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+
 namespace arm_compute
 {
 CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+    : _memory_group(memory_manager),
+      _first_pass_func(memory_manager),
+      _second_pass_func(memory_manager),
+      _first_pass_tensor()
 {
 }
+CLFFT2D::~CLFFT2D() = default;
+
 void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor *input,
+                        ICLTensor *output,
+                        const FFT2DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);
     // Setup first pass
     FFT1DInfo first_pass_config;
@@ -62,6 +76,7 @@ void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor
 Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     // Create intermediate tensor info
     TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
@@ -79,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
     ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
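CLFFT2D stays a composition of two CLFFT1D passes through the 2-channel intermediate tensor: the first pass runs along config.axis and the second pass presumably along the remaining axis, both inheriting config.direction. A sketch of the two FFT1DInfo setups (the second-axis handling is assumed from the structure above, not spelled out in this hunk):

    FFT1DInfo first_pass_config;
    first_pass_config.axis      = config.axis;                 // e.g. 0
    first_pass_config.direction = config.direction;

    FFT1DInfo second_pass_config;
    second_pass_config.axis      = (config.axis == 0) ? 1 : 0; // the other axis
    second_pass_config.direction = config.direction;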
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index ff439cca8d..2a73517549 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,12 +25,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
 namespace
@@ -41,11 +50,11 @@ int pad_decomposable(int N)
     int  pad           = 0;
     bool is_decomposed = false;
-    while(!is_decomposed)
+    while (!is_decomposed)
     {
         const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
         is_decomposed                = !decomposed_vector.empty();
-        if(!is_decomposed)
+        if (!is_decomposed)
         {
             ++pad;
         }
@@ -95,15 +104,33 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
 {
 }
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info)
+void CLFFTConvolutionLayer::configure(ICLTensor *input,
+                                      const ICLTensor *weights,
+                                      const ICLTensor *biases,
+                                      ICLTensor *output,
+                                      const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info)
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                      ICLTensor *input,
+                                      const ICLTensor *weights,
+                                      const ICLTensor *biases,
+                                      ICLTensor *output,
+                                      const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool enable_fast_math)
 {
+    ARM_COMPUTE_UNUSED(enable_fast_math);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(), conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+
     _original_weights = weights;
     _original_bias    = biases;
@@ -111,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _has_bias = biases != nullptr;
     // Get indices for the width and height
-    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     // Input shape, kernel size and output tile
-    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
-    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
-                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    const Size2D input_dims =
+        Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size =
+        Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                    pad_decomposable(input_dims.y() + kernel_size.y() - 1));
     // Tensors to use
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
     // Permute bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
         _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -133,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Permute input if needed
     _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -151,21 +181,22 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Flip weights
     _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
-    _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
+    _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis,
+                                 /* use_inverted_axis */ false);
     // Pad weights
-    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
     _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
     // Transform weights
-    _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+    _transform_weights_func = std::make_unique<CLFFT2D>();
     _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
     // Pad input
-    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
     _memory_group.manage(&_padded_input);
     _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -189,7 +220,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _memory_group.manage(&_itransformed_output);
     FFT2DInfo itranform_info;
     itranform_info.direction = FFTDirection::Inverse;
-    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransformed_output.allocator()->init(
+        _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
     _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
     _output_reduced.allocator()->allocate();
@@ -201,25 +233,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Extract correct region
     const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
     const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
-    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
-    const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
-    if(_has_bias)
+    const int end_right =
+        _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_botton =
+        _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if (_has_bias)
     {
         _memory_group.manage(&_bias_output);
     }
-    else if(_needs_permute)
+    else if (_needs_permute)
     {
         output_to_use = &_permuted_output;
         _memory_group.manage(&_permuted_output);
     }
-    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+                                   Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
     _itransformed_output.allocator()->allocate();
     // Add bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         output_to_use = output;
-        if(_needs_permute)
+        if (_needs_permute)
         {
             output_to_use = &_permuted_output;
             _memory_group.manage(&_permuted_output);
@@ -230,7 +265,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     }
     // Permute output
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -242,7 +277,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.configure(compile_context, output, nullptr, act_info);
     }
@@ -256,10 +291,16 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _flip_axis.unmap();
 }
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                       const ActivationLayerInfo &act_info)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input,
+                                       const ITensorInfo *weights,
+                                       const ITensorInfo *biases,
+                                       const ITensorInfo *output,
+                                       const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info,
+                                       bool enable_fast_math)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     // Get indices for the width and height
@@ -273,25 +314,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
     const auto strides = conv_info.stride();
     ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+                                conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+                                conv_info.pad_bottom() != (kernel_size.y() / 2));
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
     }
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+                                    (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
         // Validate Activation Layer
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
         }
@@ -307,7 +350,7 @@ void CLFFTConvolutionLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
     // Transform input
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_func.run();
     }
@@ -323,17 +366,17 @@ void CLFFTConvolutionLayer::run()
     _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
     _extract_output_func.run();
     // Add bias
-    if(_has_bias)
+    if (_has_bias)
     {
         _bias_add_func.run();
     }
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_func.run();
     }
     // Run activation layer
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.run();
     }
@@ -341,10 +384,10 @@ void CLFFTConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Permute bias to NCHW
-        if(_original_bias != nullptr)
+        if (_original_bias != nullptr)
         {
             _permuted_bias.allocator()->allocate();
             _permute_bias_func.run();
@@ -353,7 +396,7 @@ void CLFFTConvolutionLayer::prepare()
         const ICLTensor *cur_weights = _original_weights;
         // Permute weights
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
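The new enable_fast_math argument is what unlocks F16 here: validate() accepts F16 input only when fast math is requested, on top of the existing stride, kernel-size and padding restrictions. A caller-side sketch (tensor infos assumed to be already initialised; includes elided):

    const Status status = CLFFTConvolutionLayer::validate(
        input.info(), weights.info(), bias.info(), output.info(),
        conv_info, ActivationLayerInfo(), /* enable_fast_math */ true);
    if (status.error_code() != ErrorCode::OK)
    {
        // F16 without fast math, asymmetric padding, non-square kernels,
        // etc. are all reported through the returned Status.
        std::cerr << status.error_description() << std::endl;
    }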
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-#include <algorithm>
-#include <cstring>
-
-using namespace arm_compute;
-
-CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _fast_corners_kernel(),
-      _suppr_func(),
-      _copy_array_kernel(),
-      _output(),
-      _suppr(),
-      _win(),
-      _non_max(false),
-      _num_corners(nullptr),
-      _num_buffer(),
-      _corners(nullptr),
-      _constant_border_value(0)
-{
-}
-
-void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
-}
-
-void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-    ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255);
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
-    _output.allocator()->init(tensor_info);
-
-    _non_max               = nonmax_suppression;
-    _num_corners           = num_corners;
-    _corners               = corners;
-    _num_buffer            = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-    _constant_border_value = constant_border_value;
-
-    const bool update_number = (nullptr != _num_corners);
-
-    _memory_group.manage(&_output);
-    _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
-
-    if(!_non_max)
-    {
-        _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer);
-    }
-    else
-    {
-        _suppr.allocator()->init(tensor_info);
-        _memory_group.manage(&_suppr);
-
-        _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
-        _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
-
-        _suppr.allocator()->allocate();
-    }
-
-    // Allocate intermediate tensors
-    _output.allocator()->allocate();
-}
-
-void CLFastCorners::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_non_max)
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
-        const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
-        memset(out_buffer, 0, _output.info()->total_size());
-        q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
-    }
-
-    CLScheduler::get().enqueue(_fast_corners_kernel, false);
-
-    if(_non_max)
-    {
-        _suppr_func.run();
-    }
-
-    CLScheduler::get().enqueue(_copy_array_kernel, false);
-
-    unsigned int get_num_corners = 0;
-    q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
-
-    size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
-
-    _corners->resize(corner_size);
-
-    if(_num_corners != nullptr)
-    {
-        *_num_corners = get_num_corners;
-    }
-
-    q.flush();
-}
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 7b96ed1592..9bd96a975e 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,22 +23,59 @@
  */
 #include "arm_compute/runtime/CL/functions/CLFill.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClFill.h"
 #include <utility>
 namespace arm_compute
 {
-void CLFill::configure(ICLTensor *tensor, PixelValue constant_value)
+struct CLFill::Impl
+{
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClFill> op{nullptr};
+};
+
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
+{
+}
+CLFill::CLFill(CLFill &&) = default;
+CLFill &CLFill::operator=(CLFill &&) = default;
+CLFill::~CLFill() = default;
+
+void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
+}
+
+void CLFill::configure(const CLCompileContext &compile_context,
+                       ICLTensor *tensor,
+                       const PixelValue &constant_value,
+                       Window *dst_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+
+    _impl->src = tensor;
+
+    _impl->op = std::make_unique<opencl::ClFill>();
+    _impl->op->configure(compile_context, _impl->src->info(), constant_value, dst_window);
+}
+
+Status CLFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value);
+    return opencl::ClFill::validate(tensor, constant_value, dst_window);
 }
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value)
+void CLFill::run()
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-    k->configure(compile_context, tensor, constant_value);
-    _kernel = std::move(k);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
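CLFill's signature change is worth noting: the fill value is now taken by const reference, and an optional Window restricts the fill to a sub-region. Typical use (sketch; setup reduced to the essentials, and assuming the header defaults dst_window to nullptr, which fills the whole tensor):

    CLTensor t;
    t.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    t.allocator()->allocate();

    CLFill fill;
    fill.configure(&t, PixelValue(0.f)); // no Window: fill every element
    fill.run();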
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLFillBorder.h" - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value); -} - -void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>(); - k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 9a247ccfcb..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,11 +23,31 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/gpu/cl/operators/ClFlatten.h" + +namespace arm_compute +{ +struct CLFlattenLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFlatten> op{nullptr}; +}; + +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; +CLFlattenLayer::~CLFlattenLayer() = default; void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) { @@ -36,13 +56,33 @@ void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) void CLFlattenLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); - CLScheduler::get().tune_kernel_static(*_kernel); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); + + _impl->op = std::make_unique<opencl::ClFlatten>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLFlattenLayerKernel::validate(input, output); + // Checks performed when output is configured + if (output->total_size() != 0) + { + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + } + return opencl::ClFlatten::validate(input, output); +} + +void CLFlattenLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 44e1d39dc2..4322219dd9 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
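CLFlattenLayer now auto-initializes an empty output from compute_flatten_shape, which collapses width, height, and channels into dimension 0 while carrying any batch dimensions over, and validate() cross-checks a pre-configured output against the same rule. A standalone sketch of that shape rule; flatten_shape is an illustrative helper, not the library function, and assumes at least three input dimensions.

#include <cstddef>
#include <vector>

// Illustrative only: dimension 0 becomes W*H*C, trailing batch dims are kept.
// e.g. {W, H, C, N} -> {W*H*C, N}
std::vector<std::size_t> flatten_shape(const std::vector<std::size_t> &in)
{
    std::vector<std::size_t> out{in.at(0) * in.at(1) * in.at(2)};
    out.insert(out.end(), in.begin() + 3, in.end());
    return out;
}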
* * SPDX-License-Identifier: MIT * @@ -23,11 +23,30 @@ */ #include "arm_compute/runtime/CL/functions/CLFloor.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClFloor.h" namespace arm_compute { +struct CLFloor::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFloor> op{nullptr}; +}; + +CLFloor::CLFloor() : _impl(std::make_unique<Impl>()) +{ +} +CLFloor::CLFloor(CLFloor &&) = default; +CLFloor &CLFloor::operator=(CLFloor &&) = default; +CLFloor::~CLFloor() = default; + void CLFloor::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -35,13 +54,25 @@ void CLFloor::configure(const ICLTensor *input, ICLTensor *output) void CLFloor::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClFloor>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLFloor::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLFloorKernel::validate(input, output); + return opencl::ClFloor::validate(input, output); +} + +void CLFloor::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index ecbac6f703..b30f9e701f 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
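CLFloor gets the same treatment as CLFill and CLFlattenLayer above: the public class keeps only a pointer to a private Impl, so the operator headers stay out of the installed includes, and because Impl is an incomplete type in the header, the destructor and move operations must be defaulted here in the .cpp where Impl is complete. A condensed sketch of the recurring skeleton; SomeFunction and ClOp are placeholders for whichever runtime function and opencl operator are being paired.

// Sketch of the shared pattern; ClOp stands in for the wrapped operator.
struct SomeFunction::Impl
{
    const ICLTensor      *src{nullptr};
    ICLTensor            *dst{nullptr};
    std::unique_ptr<ClOp> op{nullptr};
};

SomeFunction::SomeFunction() : _impl(std::make_unique<Impl>())
{
}
SomeFunction::~SomeFunction() = default; // must live where Impl is complete

void SomeFunction::run()
{
    // Operators hold no tensor state; tensors are handed over per call.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}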
* * SPDX-License-Identifier: MIT * @@ -23,496 +23,137 @@ */ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" -#include <algorithm> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -namespace +struct CLFullyConnectedLayer::Impl { -Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, - GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) -{ - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.gemmlowp_multiplier = 0; - gemmlowp_output_stage.gemmlowp_shift = 0; - - const auto data_type = input.data_type(); - - // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) - { - const QuantizationInfo oq_info = output.quantization_info(); - const UniformQuantizationInfo iq_unif = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif; - - const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - - if(activation_info.enabled()) - { - switch(activation_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(oq_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(oq_unif.offset); - type_max = PixelValue(activation_info.a(), data_type, oq_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(activation_info.b(), data_type, oq_info); - type_max = PixelValue(activation_info.a(), data_type, oq_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage.gemmlowp_shift = output_shift; - gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); - gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); - type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); - type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); - } - - return Status{}; -} - -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - 
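The removed construct_gemmlowp_output_stage (this logic now lives inside opencl::ClFullyConnected) derives the requantization parameters from the real multiplier (input_scale * weights_scale) / output_scale, and calculate_quantized_multiplier splits that real value into an integer multiplier plus a right shift. A self-contained sketch of the standard decomposition for multipliers below 1; ACL's helper also accepts values of 1 or more by allowing a negative shift, and error handling is omitted here.

#include <cmath>
#include <cstdint>

// Decompose m in (0, 1) as m ~= (m0 / 2^31) * 2^(-shift),
// with m0 a Q0.31 fixed-point value in [2^30, 2^31).
void quantize_multiplier(double m, std::int32_t *m0, int *shift)
{
    int exponent = 0;
    const double q = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
    *shift = -exponent;

    auto q_fixed = static_cast<std::int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalize
    {
        q_fixed /= 2;
        --*shift;
    }
    *m0 = static_cast<std::int32_t>(q_fixed);
}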
ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - ActivationLayerInfo()); // activation_info - - if(is_data_type_quantized_asymmetric(input.data_type())) - { - const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); - - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset); - const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); - } - - return Status{}; -} -} // namespace - -void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLFullyConnectedLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; -Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} + std::unique_ptr<opencl::ClFullyConnected> op{nullptr}; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(), - _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), - _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) -{ -} -void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - 
fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - fc_info.activation_info); // activation_info - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + const ITensor *original_weights{nullptr}; - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + ITensorPack run_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - // Configure gemmlowp function - _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info); + bool is_prepared{false}; + bool dynamic_weights{false}; +}; - // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info); - } -} - -void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW)); - - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - _flatten_layer.configure(compile_context, input, &_flatten_output); - - // Configure matrix multiply kernel - configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info); - - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); +CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; - // Configure matrix multiply kernel - configure_mm(compile_context, input, weights, bias, output, fc_info); -} - -void 
CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { + // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + _impl->op = std::make_unique<opencl::ClFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = fc_info.retain_internal_weights; - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_prepared = fc_info.retain_internal_weights; - _original_weights = weights; + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_weights_manager) + if (_impl->weights_manager != nullptr) { - _weights_manager->manage(weights); + _impl->weights_manager->manage(_impl->original_weights); } - const ICLTensor *weights_to_use = weights; - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) + if (!_impl->is_prepared) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(compile_context, weights); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function)); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } - - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(compile_context, weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed)); - } - else - { - // Convert weights - _convert_weights.configure(compile_context, weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->run_pack.add_tensor(ACL_SRC_0, input); + _impl->run_pack.add_tensor(ACL_DST, output); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const 
ITensorInfo *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); - - return Status{}; + 
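All of the hand-rolled checks the old validate() performed (flattened-input shape, weight transpose and layout conversion, the matching CLGEMM or CLGEMMLowpMatrixMultiplyCore validation) are delegated to opencl::ClFullyConnected::validate below. The one heuristic worth restating is how the removed code told a convolution-fed input apart from an FC-fed one; this sketch simply restates that removed logic with illustrative names.

#include <algorithm>

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

// With batches, the FC input comes from a conv layer iff every input
// dimension from index 3 up matches the output dimensions from index 1 up.
bool is_fc_after_conv(const ITensorInfo &in, const ITensorInfo &out)
{
    const bool is_batched = out.dimension(1) > 1;
    if (is_batched)
    {
        return (TensorShape::num_max_dimensions >= 4) &&
               std::equal(in.tensor_shape().cbegin() + 3, in.tensor_shape().cend(),
                          out.tensor_shape().cbegin() + 1);
    }
    return in.num_dimensions() > 1; // without batches: any non-vector input
}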
return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); } void CLFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - _flatten_layer.run(); + prepare(); } - // Run matrix multiply - if(_is_quantized) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in prepare stage + release_temporaries<CLTensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 6deecdc089..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ 
b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,39 +29,68 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" + namespace arm_compute { CLFuseBatchNormalization::CLFuseBatchNormalization() - : _fuse_bn_kernel() + : _fuse_bn_kernel(std::make_unique<CLFuseBatchNormalizationKernel>()) { } -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; + +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return 
CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void CLFuseBatchNormalization::run() { - CLScheduler::get().enqueue(_fuse_bn_kernel, true); + CLScheduler::get().enqueue(*_fuse_bn_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 8466024c04..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,675 +23,130 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMM.h" -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/float_ops.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h" -#include "arm_compute/runtime/ITensorAllocator.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemm.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; +using OperatorType = opencl::ClGemm; -CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(std::move(memory_manager)), - _weights_manager(weights_manager), - _mm_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _reshape_rhs_kernel_managed(), - _mm_reshaped_kernel(), - _mm_reshaped_only_rhs_kernel(), - _tmp_a(), - _tmp_b(), - _original_b(nullptr), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1) +struct CLGEMM::Impl { -} + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; -CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) +CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); - 
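For reference, the fuse-batch-norm kernel wrapped above folds a batch normalization (mean, variance, gamma, beta, epsilon) into the preceding layer's weights and bias with the usual per-channel algebra. A scalar sketch under the simplifying assumption of one weight per output channel; real weight tensors carry many elements per channel, all scaled by the same per-channel factor, and the array names here are illustrative.

#include <cmath>
#include <cstddef>

// Per output channel c:
//   w' = w * gamma / sqrt(var + eps)
//   b' = (b - mean) * gamma / sqrt(var + eps) + beta
void fuse_bn(float *w, float *b, const float *mean, const float *var,
             const float *gamma, const float *beta, float eps, std::size_t channels)
{
    for (std::size_t c = 0; c < channels; ++c)
    {
        const float scale = gamma[c] / std::sqrt(var[c] + eps);
        w[c] = w[c] * scale;                      // one weight per channel shown for brevity
        b[c] = (b[c] - mean[c]) * scale + beta[c];
    }
}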
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); - - CLGEMMKernelSelectionParams params; - params.m = m; - params.n = n; - params.k = k; - params.is_rhs_constant = reshape_b_only_on_first_run; - params.data_type = data_type; - - return gemm_kernel->select_kernel(params); + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } -void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_kernel.set_target(gpu_target); - - GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); - - // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - // Tune kernel statically - CLScheduler::get().tune_kernel_static(_mm_kernel); -} +CLGEMM::~CLGEMM() = default; -void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->info()->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); - - // Configure transpose kernel - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) - { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); - } - else - { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } - - // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - CLScheduler::get().tune_kernel_static(_mm_kernel); - - // Allocate intermediate tensors - _tmp_a.allocator()->allocate(); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); - } -} - -void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - DataType data_type = a->info()->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) - { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); - } - else - { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } - - // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Allocate intermediate tensors - _tmp_a.allocator()->allocate(); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); - } + configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - DataType data_type = a->info()->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _mm_kernel.set_target(gpu_target); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + _impl->b = b; + _impl->op = std::make_unique<OperatorType>(); + _impl->is_prepared = gemm_info.retain_internal_weights(); - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + alpha, beta, gemm_info); + _impl->aux_mem_req = _impl->op->workspace(); - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) + // Manage/allocate auxilairy tensors + if (_impl->is_prepared) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias()); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta, - false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; + return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } -Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, - true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; -} - -Status CLGEMM::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); -} - -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _is_prepared = gemm_info.retain_internal_weights(); - _original_b = b; - - // Get the GPU target - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - - // Select GEMMType - _gemm_kernel_type = select_gemm_kernel(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - const ICLTensor *c_to_use = fuse_add_c ? 
c : nullptr; - - switch(_gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED: - { - configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::run() { - // Get the GPU target - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - - // Select GEMMType - CLGEMMKernelType gemm_kernel_type = select_gemm_kernel(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run()); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; + prepare(); - switch(gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - default: - { - ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); - } - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - return Status{}; + _impl->op->run(_impl->run_pack); } -void CLGEMM::run() +void CLGEMM::prepare() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run matrix multiply kernel - switch(_gemm_kernel_type) + if (!_impl->is_prepared) { - case CLGEMMKernelType::NATIVE_V1: - { - CLScheduler::get().enqueue(_mm_kernel, true); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + _impl->op->prepare(_impl->prep_pack); - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - CLScheduler::get().enqueue(_mm_kernel, true); - break; - } - case CLGEMMKernelType::RESHAPED: + if (has_reshape != std::end(_impl->aux_mem_req)) { - // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); - - 
if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } - - CLScheduler::get().enqueue(_mm_reshaped_kernel, true); - break; + _impl->b->mark_as_unused(); } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } - - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -void CLGEMM::prepare() -{ - if(!_is_prepared) - { - if(_gemm_kernel_type != CLGEMMKernelType::NATIVE_V1 && _reshape_b_only_on_first_run) + else { - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - // Run transpose kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - _original_b->mark_as_unused(); - } + // Pack the B matrix to be used as the underlying GEMM performs no reshapes + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->b); } - CLScheduler::get().queue().finish(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 1c37993bda..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
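[Editor's note — illustrative sketch, not part of the diff] The rewritten CLGEMM::prepare() above keys its behaviour off the operator's auxiliary memory requirements: if any workspace entry has a Persistent lifetime, the underlying operator keeps its own reshaped copy of B, so the caller-visible tensor can be marked unused; otherwise the original B must stay in the run pack. A minimal self-contained sketch of that decision, with simplified stand-in types (MemoryLifetime, MemoryInfo and operator_owns_reshaped_b below are mock-ups, not the library's definitions):

#include <algorithm>
#include <vector>

enum class MemoryLifetime { Temporary, Prepare, Persistent };
struct MemoryInfo
{
    MemoryLifetime lifetime;
};

// True when prepare() leaves a persistent reshaped copy of B behind, i.e. the
// caller's B tensor is no longer needed at run time and can be marked unused.
bool operator_owns_reshaped_b(const std::vector<MemoryInfo> &aux_mem_req)
{
    const auto it = std::find_if(aux_mem_req.begin(), aux_mem_req.end(),
                                 [](const MemoryInfo &m) { return m.lifetime == MemoryLifetime::Persistent; });
    return it != aux_mem_req.end();
}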
* * SPDX-License-Identifier: MIT * @@ -23,15 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmConv2d.h" +#include "support/Cast.h" + #include <cmath> #include <memory> #include <tuple> @@ -40,635 +44,117 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() -{ -} - -void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); -} - -void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info(), - num_groups)); - - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ICLTensor *biases_to_use = (append_biases) ? 
biases : nullptr; - - _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +struct CLGEMMConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type())); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - CLWeightsReshapeKernel::validate(weights, biases, output, num_groups); - } - - return Status{}; -} - -void CLConvolutionLayerReshapeWeights::run() + const ITensor *weights{nullptr}; + std::unique_ptr<opencl::ClGemmConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - CLScheduler::get().enqueue(_weights_reshape_kernel); -} - -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) -{ -} - -void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - _skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other convolution layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply function - _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info); - } -} - -Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) -{ - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->quantization_info(); - - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Perform validation step on GEMMLowp - return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info); - } - else - { - // Perform validation step on Matrix multiply function - return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info); - } + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } -void 
CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; + +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - const unsigned int num_kernels = weights->info()->dimension(idx_kernels); - - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - _skip_col2im = data_layout == DataLayout::NHWC; - - // Only for quantize there are few cases where we cannot fuse the activation function in GEMM - _fuse_activation = true; - - // Set the GPU target for im2col and col2im - _im2col_kernel.set_target(CLScheduler::get().target()); - _col2im_kernel.set_target(CLScheduler::get().target()); - - const ICLTensor *gemm_input_to_use = input; - ICLTensor *gemm_output_to_use = output; - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ICLTensor *biases_to_use = biases; - bool append_bias = false; - - ICLTensor *weights_to_use = &_weights_reshaped; - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, biases, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups); - } - } - else - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups); - } - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure and tune im2col. 
im2col output shape is auto-initialized - _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); - - // Set quantization info - _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); - CLScheduler::get().tune_kernel_static(_im2col_kernel); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create GEMM output tensor - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // If we cannot skip col2im it means we run im2col as well - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions. - TensorInfo info_gemm(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - - // Configure output stage for quantized case - if(_is_quantized) - { - const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info; - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); - const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; - - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input->info(), - weights->info(), - output->info(), - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - - auto min_activation = min_val.get<int32_t>(); - auto max_activation = max_val.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - _fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // Configure and tune GEMM - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - - configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - // Configure and tune Col2Im - _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); - CLScheduler::get().tune_kernel_static(_col2im_kernel); - } - - if(!_skip_col2im) - { - _gemm_output.allocator()->allocate(); - } - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); - - if(!_fuse_activation) - { - _activationlayer_function.configure(compile_context, output, nullptr, act_info); - } - - ARM_COMPUTE_UNUSED(weights_info); + _impl->weights = weights; + _impl->op = std::make_unique<opencl::ClGemmConv2d>(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + }; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - - if(is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW)); - - const DataLayout data_layout = input->data_layout(); - const DataType 
data_type = input->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - const unsigned int num_kernels = weights->dimension(idx_kernels); - - TensorInfo im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; - - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if(act_info.enabled()) - { - ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a()); - } - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ITensorInfo *biases_to_use = biases; - bool append_bias = false; - - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type); - } - - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - const Size2D kernel_dims(kernel_width, kernel_height); - - // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups); - - auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, 
conv_info, append_bias, dilation, num_groups)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create GEMM output tensor - if(!skip_col2im) - { - TensorShape shape_gemm; - - shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - info_gemm = TensorInfo(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - if(is_quantized) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info; - const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input, - weights, - output, - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - int min_activation = 0; - int max_activation = 0; - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info)); - - // Validate Col2Im - if(!skip_col2im) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups)); - } - - //Validate Activation Layer - if(!fuse_activation) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); } void CLGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run im2col - if(!_skip_im2col) - { - CLScheduler::get().enqueue(_im2col_kernel); - } - - // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - CLScheduler::get().enqueue(_col2im_kernel, false); - } - - //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) - { - _activationlayer_function.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + _impl->op->prepare(_impl->prep_pack); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) { - _weights_manager->run(_original_weights, &_reshape_weights_managed); + _impl->weights->mark_as_unused(); } else { - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); + // Pack the B matrix to be used as the underlying GEMM performs no reshapes + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); } - - CLScheduler::get().queue().finish(); - _is_prepared = true; + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 1dcb341fe7..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
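[Editor's note — illustrative sketch, not part of the diff] The new CLGEMMConvolutionLayer::configure() above builds two tensor packs: run_pack carries everything run() touches on every invocation (src, weights, biases, dst), while prep_pack carries only the constant tensors that the one-shot prepare() consumes. A reduced sketch of that split, with a simplified pack type (TensorSlot, Pack and build_packs are stand-ins, not the library's ITensorPack API):

#include <map>

enum TensorSlot { SRC_0, SRC_1, SRC_2, DST };
struct Tensor; // opaque user tensor
using Pack = std::map<TensorSlot, const Tensor *>;

// src/dst change on every invocation, so they live only in the run pack;
// weights and biases are constant, so they are also handed to prepare().
void build_packs(const Tensor *src, const Tensor *weights, const Tensor *biases,
                 const Tensor *dst, Pack &run_pack, Pack &prep_pack)
{
    run_pack  = Pack{{SRC_0, src}, {SRC_1, weights}, {SRC_2, biases}, {DST, dst}};
    prep_pack = Pack{{SRC_1, weights}, {SRC_2, biases}};
}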
* * SPDX-License-Identifier: MIT * @@ -24,24 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include <memory> +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + #include <tuple> namespace arm_compute { namespace { -std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair<Coordinates, Coordinates> +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -59,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -74,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -99,7 +108,7 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage _permute_weights_to_nhwc(), _reshape_weights(), _transpose_weights(), - _deconv_reshape(), + _deconv_reshape(std::make_unique<CLDeconvolutionReshapeOutputKernel>()), _slice_gemm(), _gemmlowp_final(), _reshaped_weights(), @@ -116,15 +125,23 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage { } -Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info) +CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default; + +Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - 
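[Editor's note — illustrative sketch, not part of the diff] construct_gemmlowp_output_stage() above folds the three quantization scales into a single requantization factor, multiplier = iq.scale * wq.scale / oq.scale, and asks calculate_quantized_multiplier() to split it into an integer multiplier and a power-of-two shift so the output stage can run without floating point. A self-contained sketch of that decomposition (decompose_multiplier and the frexp-based approach are illustrative, not the library's exact implementation):

#include <cmath>
#include <cstdint>

// Decompose 0 < real_multiplier < 1 as quantized_multiplier * 2^(-31 - right_shift).
void decompose_multiplier(double real_multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    int exponent = 0;
    const double significand = std::frexp(real_multiplier, &exponent); // significand in [0.5, 1)
    std::int64_t q = std::llround(significand * (1LL << 31));
    if (q == (1LL << 31)) // significand rounded up to 1.0: renormalise
    {
        q /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<std::int32_t>(q);
    *right_shift = -exponent; // >= 0 for multipliers below 1
}

Fixed-point output stages of this kind then apply the integer multiplier to each accumulator with a saturating rounding doubling high-multiply, followed by a rounding right shift by right_shift.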
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); DataLayout data_layout = input->data_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -138,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -160,76 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - 
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); 
ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -238,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. 
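[Editor's note — illustrative sketch, not part of the diff] The padded branches validated above all end in a CLSlice: the GEMM-based path first materialises the full deconvolution output and then crops away the border implied by the padding in deconv_info, using the coordinates from compute_start_end_slice_coordinates(). A reduced NCHW sketch of that cropping arithmetic (Box and slice_region are illustrative helpers, not library types):

#include <cstddef>

struct Box
{
    std::size_t x0, y0, x1, y1;
};

// NCHW variant of the slice computed above: keep the window that remains after
// dropping the synthetic border a padded deconv_info produces.
Box slice_region(std::size_t out_w, std::size_t out_h,
                 std::size_t pad_l, std::size_t pad_t, std::size_t pad_r, std::size_t pad_b)
{
    return Box{pad_l, pad_t, out_w - pad_r, out_h - pad_b};
}

For example, slice_region(16, 16, 1, 1, 1, 1) yields {1, 1, 15, 15}, i.e. the inner 14x14 window of a 16x16 intermediate.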
- if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -250,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -262,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -279,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -291,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -299,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -317,21 
+367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. - if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -343,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -357,14 +410,14 @@ void CLGEMMDeconvolutionLayer::run() _mm_gemm.run(); } - CLScheduler::get().enqueue(_deconv_reshape, false); + CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -372,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -385,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -394,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -404,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 84da4a7e98..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
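[Editor's note — illustrative sketch, not part of the diff] In the quantized deconvolution path configured above, the offsets in the input and reshaped-weights quantization info are negated before the GEMMLowp core is configured because, as the code comments note, gemmlowp adds the offsets instead of subtracting them; the originals are restored afterwards so the tensors can be reused elsewhere. A reduced sketch of the sign flip (QInfo and negated are stand-ins, not the library's QuantizationInfo):

struct QInfo
{
    float scale;
    int   offset;
};

// The gemmlowp kernels add the stored offset during the offset-contribution
// step, while the convolution math needs it subtracted, so the quantization
// info handed to the GEMMLowp core carries a negated offset.
inline QInfo negated(const QInfo &q)
{
    return QInfo{q.scale, -q.offset};
}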
* * SPDX-License-Identifier: MIT * @@ -23,537 +23,111 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; +using namespace arm_compute::experimental; +using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; -namespace -{ -inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) +struct CLGEMMLowpMatrixMultiplyCore::Impl { - std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); - - CLGEMMKernelSelectionParams params; - params.m = m; - params.n = n; - params.k = k; - params.is_rhs_constant = reshape_b_only_on_first_run; - params.data_type = data_type; - - switch(gemm_kernel->select_kernel(params)) - { - case CLGEMMKernelType::NATIVE: - return false; - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - return true; - default: - ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); - } -} -} // namespace + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(), - _mm_native_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), - _qasymm8_weights(), - _vector_sum_col(), - _vector_sum_row(), - _tmp_b(), - _mm_result_s32(), - _gemm_output_stage_multipliers(), - _gemm_output_stage_shifts(), - _matrix_a(nullptr), - _original_b(nullptr), - _output(nullptr), - _a_offset(0), - _b_offset(0), - _is_gemm_reshaped(true), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _run_output_stage(false), - _convert_to_qasymm8(false), - _run_offset_contribution(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default; + +void 
CLGEMMLowpMatrixMultiplyCore::configure( + const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info); } -void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - _is_prepared = false; - _original_b = b; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _a_offset = a->info()->quantization_info().uniform().offset; - _matrix_a = a; - _output = output; - - _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type()) - && is_data_type_quantized_asymmetric(a->info()->data_type()); - _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_native_kernel.set_target(gpu_target); - _mm_reshaped_only_rhs_kernel.set_target(gpu_target); - - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + _impl->b = b; + _impl->op = std::make_unique<OperatorType>(); + _impl->is_prepared = gemm_info.retain_internal_weights(); - // Check if we need to reshape the matrix A and matrix B - _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + gemm_info); + _impl->aux_mem_req = _impl->op->workspace(); - if(_convert_to_qasymm8) + // Manage/allocate auxiliary tensors + if (_impl->is_prepared) { - // Set data type for converted weights - TensorInfo weights_info(*b->info()); - weights_info.set_data_type(DataType::QASYMM8); - _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); - } - - const ICLTensor *matrix_b = _convert_to_qasymm8 ? 
&_qasymm8_weights : b; - if(_is_gemm_reshaped) - { - matrix_b = &_tmp_b; - - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Pick up the GEMM configuration - // Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure reshape RHS kernel - _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); - } - - // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = _a_offset; - gemm_kernel_info.b_offset = _b_offset; - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = _matrix_a->info()->data_type(); - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - - if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - } - else - { - _run_output_stage = true; - - _memory_group.manage(&_mm_result_s32); - - if(_is_gemm_reshaped) - { - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - - _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - _mm_result_s32.allocator()->allocate(); - } - } - - _gemm_output_stage_multipliers.allocator()->allocate(); - _gemm_output_stage_shifts.allocator()->allocate(); - // Compute GEMM output multipliers and shifts for output stage - _gemm_output_stage_multipliers.map(); - _gemm_output_stage_shifts.map(); - std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); - std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); - _gemm_output_stage_multipliers.unmap(); - _gemm_output_stage_shifts.unmap(); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _run_offset_contribution = true; - if(_is_gemm_reshaped) - { - // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } - - // Configure offset contribution kernel - _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); - } - - // Allocate tensors - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - const ITensorInfo *matrix_a_info = a; - - TensorInfo tmp_b_info{}; - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run()); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); - TensorInfo weights_info(*b); - if(convert_to_qasymm8) - { - b_offset = -128; - weights_info.set_data_type(DataType::QASYMM8); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0)); - } - const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) - { - matrix_b_info = &tmp_b_info; - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); - } - - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info; - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = a_offset; - gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = a->data_type(); - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - else - { - TensorInfo mm_result_s32_info{}; - - if(reshape_matrix_b) - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); - } - else - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - } - else - { - if(reshape_matrix_b) - { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); - } - - if(output->total_size() != 0) - { - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); - } - } - - return Status{}; + return OperatorType::validate(a, b, c, output, gemm_info); } void CLGEMMLowpMatrixMultiplyCore::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - // Run reshape matrix B - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); - } - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } - - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - // Run matrix multiply - if(_is_gemm_reshaped) - { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false); - } - else - { - CLScheduler::get().enqueue(_mm_native_kernel, false); - } - if(_run_output_stage) - { - // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true); - } - if(_run_offset_contribution) - { - // Run offset contribution kernel - CLScheduler::get().enqueue(_offset_contribution_kernel, true); - } + _impl->op->run(_impl->run_pack); } void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_convert_to_qasymm8) - { - _qasymm8_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_to_qasymm8, false); - } - - if(_is_gemm_reshaped && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); - _original_b->mark_as_unused(); - } + _impl->op->prepare(_impl->run_pack); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); - CLScheduler::get().queue().finish(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 9ae5d5121c..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
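// The diff that begins here gives CLGEMMLowpOutputStage the same rework as
// CLGEMMLowpMatrixMultiplyCore above: the public function keeps its signature
// but becomes a thin wrapper that owns a pImpl and forwards an ITensorPack to
// a reusable operator. A self-contained analogue of that idiom; every name
// below is illustrative rather than library API, and the real code
// additionally routes auxiliary memory through manage_workspace<CLTensor>()
// and release_temporaries() from src/core/helpers/MemoryHelpers.h:

#include <map>
#include <memory>

struct Tensor; // opaque payload, stands in for ICLTensor
using TensorPack = std::map<int, Tensor *>;

struct Operator
{
    void prepare(TensorPack &) {} // one-shot work, e.g. transforming constant inputs
    void run(TensorPack &) {}     // per-call work on whatever the pack points at
};

class Function
{
public:
    void configure(Tensor *src, Tensor *dst)
    {
        _op   = std::make_unique<Operator>();
        _pack = {{0, src}, {1, dst}}; // mirrors {{ACL_SRC_0, a}, ..., {ACL_DST, output}}
    }
    void run()
    {
        prepare(); // lazy and idempotent, exactly as in the functions above
        _op->run(_pack);
    }
    void prepare()
    {
        if (!_is_prepared)
        {
            _op->prepare(_pack); // afterwards prepare-only buffers can be released
            _is_prepared = true;
        }
    }

private:
    std::unique_ptr<Operator> _op{nullptr};
    TensorPack                _pack{};
    bool                      _is_prepared{false};
};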
* * SPDX-License-Identifier: MIT * @@ -23,249 +23,73 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" -namespace arm_compute -{ -void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); - _kernel = std::move(k); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, - int result_mult_int, - int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, 
result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h" -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} +#include <algorithm> -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +namespace arm_compute { - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - float multiplier, int offset, - int min, int max) +struct CLGEMMLowpOutputStage::Impl { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = offset; - info.gemmlowp_real_multiplier = multiplier; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); - _kernel = std::move(k); -} + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr}; + ITensorPack run_pack{}; +}; -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - float multiplier, int offset, - int min, int max) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = offset; - info.gemmlowp_real_multiplier = multiplier; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); } +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; 
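// The per-type kernels removed in this file, and the opencl::ClGemmLowpOutputStage
// that replaces them, implement the same QUANTIZE_DOWN_FIXEDPOINT arithmetic:
// add the bias, saturating-rounding-doubling-high-multiply by gemmlowp_multiplier,
// rounding right shift by gemmlowp_shift, add gemmlowp_offset, then clamp to
// [gemmlowp_min_bound, gemmlowp_max_bound]. A scalar sketch of those reference
// semantics (not the OpenCL kernel; the int8/int16 variants differ essentially
// in the final cast, and a non-negative shift is assumed):

#include <algorithm>
#include <cstdint>
#include <limits>

int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    // Rounded (a * b * 2) / 2^31, saturating the single overflowing case
    if (a == std::numeric_limits<int32_t>::min() && b == a)
        return std::numeric_limits<int32_t>::max();
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

int32_t rounding_shift_right(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier, int shift,
                                 int32_t offset, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                       // fused bias addition
    acc = rounding_shift_right(sat_rounding_doubling_high_mul(acc, multiplier), shift);
    acc += offset;                                     // requantised zero point
    return static_cast<uint8_t>(std::clamp(acc, min_bound, max_bound));
}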
+CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); - _kernel = std::move(k); -} + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); + _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>(); + _impl->op->configure(compile_context, input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported output data type."); - } - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::run() { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output 
data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } + _impl->op->run(_impl->run_pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index e2b18e0f55..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -34,9 +35,14 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { - auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); + auto k = std::make_unique<CLGatherKernel>(); k->configure(compile_context, input, indices, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp deleted file mode 100644 index 47367c4b17..0000000000 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
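// CLGather above shows the shape every remaining "simple function" takes after
// this changeset: build the single kernel with std::make_unique, configure it,
// move it into _kernel. A usage sketch, assuming an initialised CL backend;
// the shapes, the U32 index type and the axis are illustrative:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

void gather_usage_sketch()
{
    using namespace arm_compute;

    CLScheduler::get().default_init();

    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32)); // 4 elements along axis 0
    indices.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // two positions to pick
    dst.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::F32)); // axis-0 extent becomes 2

    CLGather gather;
    gather.configure(&src, &indices, &dst, 0 /* axis */);

    src.allocator()->allocate();
    indices.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src and indices through map()/unmap() ...

    gather.run();
}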
- */ -#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" - -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp deleted file mode 100644 index 6b82cd0c35..0000000000 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -#include <utility> - -using namespace arm_compute; - -CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp() -{ -} - -void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Configure kernels - _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _tmp.allocator()->allocate(); -} - -void CLGaussian5x5::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); -} diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp deleted file mode 100644 index 1ac98787ac..0000000000 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" - -#include <cstddef> - -using namespace arm_compute; - -CLGaussianPyramid::CLGaussianPyramid() - : _input(nullptr), _pyramid(nullptr), _tmp() -{ -} - -CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT - : _horizontal_border_handler(), - _vertical_border_handler(), - _horizontal_reduction(), - _vertical_reduction() -{ -} - -void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(pyramid == nullptr); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); - - // Constant value to use for vertical fill border when the border mode is CONSTANT - const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6; - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _horizontal_border_handler.resize(num_levels - 1); - _vertical_border_handler.resize(num_levels - 1); - _horizontal_reduction.resize(num_levels - 1); - _vertical_reduction.resize(num_levels - 1); - - // Apply half scale to the X dimension of the tensor shape - TensorShape tensor_shape = pyramid->info()->tensor_shape(); - tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16); - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); - - /* Configure vertical kernel */ - _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); - - /* Configure border */ - _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); - - /* Configure border */ - _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), 
_vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); - } - _tmp.allocate(); - } -} - -void CLGaussianPyramidHalf::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - CLScheduler::get().enqueue(_horizontal_border_handler[i], false); - CLScheduler::get().enqueue(_horizontal_reduction[i], false); - CLScheduler::get().enqueue(_vertical_border_handler[i], false); - CLScheduler::get().enqueue(_vertical_reduction[i], false); - } -} - -CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT - : _gauss5x5(), - _scale_nearest() -{ -} - -void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _gauss5x5.resize(num_levels - 1); - _scale_nearest.resize(num_levels - 1); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); - - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure gaussian 5x5 */ - _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); - - /* Configure scale image kernel */ - _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER); - } - - _tmp.allocate(); - } -} - -void CLGaussianPyramidOrb::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - 
_gauss5x5[i].run(); - CLScheduler::get().enqueue(_scale_nearest[i]); - } -} diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 7f037fc51f..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,21 +25,29 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), - _flatten_deltas_kernel(), - _permute_scores_kernel(), - _flatten_scores_kernel(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), - _dequantize_anchors(), - _dequantize_deltas(), - _quantize_all_proposals(), + _permute_deltas(), + _flatten_deltas(), + _permute_scores(), + _flatten_scores(), + _compute_anchors_kernel(std::make_unique<CLComputeAllAnchorsKernel>()), + _bounding_box_kernel(std::make_unique<CLBoundingBoxTransformKernel>()), + _pad_kernel(std::make_unique<CLPadLayerKernel>()), + _dequantize_anchors(std::make_unique<CLDequantizationLayer>()), + _dequantize_deltas(std::make_unique<CLDequantizationLayer>()), + _quantize_all_proposals(std::make_unique<CLQuantizationLayer>()), _cpp_nms(memory_manager), _is_nhwc(false), _is_qasymm8(false), @@ -61,53 +69,75 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage { } -void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, +CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default; + +void CLGenerateProposalsLayer::configure(const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); + configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, + num_valid_proposals, info); } -void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, - ICLTensor *scores_out, - ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) +void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - 
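// The configure() below chains: compute all anchors, permute/flatten the deltas
// and scores, dequantise to F32 for QASYMM8 inputs, apply the bounding-box
// transform, run NMS on the CPP backend, and finally pad a leading batch-id
// column onto the proposals. The box transform itself is the usual Faster R-CNN
// decoding; a scalar sketch of one common convention (the kernel's exact
// behaviour, including scaling, coordinate correction and clipping to the image,
// is governed by BoundingBoxTransformInfo):

#include <array>
#include <cmath>

// anchor as (x1, y1, x2, y2), delta as (dx, dy, dw, dh)
std::array<float, 4> decode_box(const std::array<float, 4> &anchor,
                                const std::array<float, 4> &delta)
{
    const float w     = anchor[2] - anchor[0];
    const float h     = anchor[3] - anchor[1];
    const float ctr_x = anchor[0] + 0.5f * w;
    const float ctr_y = anchor[1] + 0.5f * h;

    const float pred_ctr_x = delta[0] * w + ctr_x;   // shift proportional to anchor size
    const float pred_ctr_y = delta[1] * h + ctr_y;
    const float pred_w     = std::exp(delta[2]) * w; // log-space size deltas
    const float pred_h     = std::exp(delta[3]) * h;

    return {pred_ctr_x - 0.5f * pred_w, pred_ctr_y - 0.5f * pred_h,
            pred_ctr_x + 0.5f * pred_w, pred_ctr_y + 0.5f * pred_h};
}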
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? 
QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel.configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); + _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } else { - _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened); + _flatten_deltas.configure(compile_context, deltas, &_deltas_flattened); } const TensorShape flatten_shape_scores(1, total_num_anchors); @@ -115,49 +145,50 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); + _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } else { - _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened); + _flatten_scores.configure(compile_context, scores, &_scores_flattened); } CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); _memory_group.manage(&_all_anchors_f32); _memory_group.manage(&_deltas_flattened_f32); // Dequantize anchors to float - _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32); + _dequantize_anchors->configure(compile_context, &_all_anchors, &_all_anchors_f32); _all_anchors.allocator()->allocate(); anchors_to_use = &_all_anchors_f32; // Dequantize deltas to float - _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); + _dequantize_deltas->configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); _deltas_flattened.allocator()->allocate(); deltas_to_use = &_deltas_flattened_f32; } // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), 
info.im_height(), 1.f); - _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box_kernel->configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); - _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; } @@ -172,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -184,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -205,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -216,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + 
anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info)); - - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); + + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -298,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { 
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -342,34 +409,34 @@ void CLGenerateProposalsLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Compute all the anchors - CLScheduler::get().enqueue(_compute_anchors_kernel, false); + CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { - CLScheduler::get().enqueue(_permute_deltas_kernel, false); - CLScheduler::get().enqueue(_permute_scores_kernel, false); + _permute_deltas.run(); + _permute_scores.run(); } - CLScheduler::get().enqueue(_flatten_deltas_kernel, false); - CLScheduler::get().enqueue(_flatten_scores_kernel, false); + _flatten_deltas.run(); + _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { - CLScheduler::get().enqueue(_dequantize_anchors, false); - CLScheduler::get().enqueue(_dequantize_deltas, false); + _dequantize_anchors->run(); + _dequantize_deltas->run(); } // Build the boxes - CLScheduler::get().enqueue(_bounding_box_kernel, false); + CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { - CLScheduler::get().enqueue(_quantize_all_proposals, false); + _quantize_all_proposals->run(); } // Non maxima suppression run_cpp_nms_kernel(); // Add dummy batch indexes - CLScheduler::get().enqueue(_pad_kernel, true); + CLScheduler::get().enqueue(*_pad_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp deleted file mode 100644 index 0645cfdf22..0000000000 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/HOGInfo.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() -{ -} - -void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); -} - -void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == output); - ARM_COMPUTE_ERROR_ON(nullptr == hog); - - const HOGInfo *hog_info = hog->info(); - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const size_t num_bins = hog_info->num_bins(); - - Size2D cell_size = hog_info->cell_size(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell_size.width; - const size_t num_cells_y = height / cell_size.height; - - // TensorShape of the input image - const TensorShape &shape_img = input->info()->tensor_shape(); - - // TensorShape of the hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Intitialize tensors for magnitude, phase and hog space - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space.allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space); - - // Initialise orientation binning kernel - _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); - - // Initialize HOG norm kernel - _block_norm.configure(compile_context, &_hog_space, output, hog->info()); - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - _hog_space.allocator()->allocate(); -} - -void CLHOGDescriptor::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run gradient - _gradient.run(); - - // Run orientation binning - CLScheduler::get().enqueue(_orient_bin, false); - - // Run block normalization - CLScheduler::get().enqueue(_block_norm); -}
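The deleted CLHOGDescriptor::configure() above sizes its HOG space by dividing the image dimensions by the HOG cell size before initializing the magnitude, phase and HOG-space tensors. A minimal standalone sketch of that shape computation, with stand-in types (GridSize and hog_space_grid are illustrative names, not library API):

#include <cstddef>
#include <iostream>

struct GridSize
{
    size_t width;
    size_t height;
};

// Mirrors the removed code: one orientation histogram is produced per cell,
// so the HOG space is the image resolution divided by the cell size
// (integer floor division, as in the deleted configure()).
GridSize hog_space_grid(size_t image_w, size_t image_h, const GridSize &cell)
{
    return GridSize{image_w / cell.width, image_h / cell.height};
}

int main()
{
    const GridSize cells = hog_space_grid(128, 64, GridSize{8, 8});
    std::cout << cells.width << "x" << cells.height << std::endl; // prints 16x8
    return 0;
}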
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp deleted file mode 100644 index bf9bae1e8b..0000000000 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGDetector.h" - -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include <algorithm> - -using namespace arm_compute; - -CLHOGDetector::CLHOGDetector() - : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows() -{ -} - -void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, - float threshold, size_t idx_class) -{ - _detection_windows = detection_windows; - - // Allocate buffer for storing the number of detected objects - _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); - - // Configure HOGDetectorKernel - _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - // Reset number of detections - const unsigned int init_num_detection_windows = _detection_windows->num_values(); - q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); - - // Run CLHOGDetectorKernel - CLScheduler::get().enqueue(_hog_detector_kernel); - - // Read number of detections - unsigned int num_detection_windows = 0; - q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows); - - // Update the number of values stored in _detection_windows - 
_detection_windows->resize(static_cast<size_t>(num_detection_windows));
-
-    q.flush();
-}
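The run() body above follows a small host/device round trip: reset a one-word counter buffer, let the detector kernel bump it on the device, then read the final value back and resize the detection-window array to match. A distilled sketch of the same pattern with the plain OpenCL C++ wrapper; the header name varies across SDKs, and the actual kernel dispatch is elided:

#include <CL/cl2.hpp> // may be <CL/opencl.hpp> on newer SDKs
#include <iostream>

int main()
{
    cl::Context      context(CL_DEVICE_TYPE_DEFAULT);
    cl::CommandQueue queue(context);

    // One-word, host-visible counter, like _num_detection_windows above.
    cl::Buffer counter(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));

    // Reset the counter before dispatch (non-blocking write).
    unsigned int init = 0;
    queue.enqueueWriteBuffer(counter, CL_FALSE, 0, sizeof(unsigned int), &init);

    // ... a kernel that atomically increments the counter would run here ...

    // Blocking read gives the number of items the kernel produced.
    unsigned int produced = 0;
    queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(unsigned int), &produced);
    std::cout << "produced = " << produced << std::endl;
    return 0;
}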
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp deleted file mode 100644 index acf5f2c568..0000000000 --- a/src/runtime/CL/functions/CLHOGGradient.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy() -{ -} - -void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value); -} - -void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); - - const TensorShape &shape_img = input->info()->tensor_shape(); - - // Allocate image memory - TensorInfo info(shape_img, Format::S16); - _gx.allocator()->init(info); - _gy.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Initialise derivate kernel - _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - - // Initialise magnitude/phase kernel - if(PhaseType::UNSIGNED == phase_type) - { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); - } - else - { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); - } - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); -} - -void 
CLHOGGradient::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run derivative
-    _derivative.run();
-
-    // Run magnitude/phase kernel
-    CLScheduler::get().enqueue(_mag_phase);
-}
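The deleted CLHOGGradient pairs a derivative stage with a magnitude/phase stage (MagnitudeType::L2NORM plus a signed or unsigned phase). A scalar C++ reference for the per-pixel arithmetic; the U8 phase encoding below (quantizing the angle range to 0..255) is an assumption for illustration, not the kernel's exact rounding:

#include <cmath>
#include <cstdint>
#include <iostream>

constexpr float kPi = 3.14159265358979f;

struct MagPhase
{
    int16_t magnitude; // L2 magnitude, matching the removed S16 output
    uint8_t phase;     // quantized angle
};

MagPhase mag_phase(int16_t gx, int16_t gy, bool unsigned_phase)
{
    const float mag   = std::sqrt(float(gx) * float(gx) + float(gy) * float(gy));
    float       angle = std::atan2(float(gy), float(gx)); // in [-pi, pi]

    // UNSIGNED phase folds opposite gradient directions together ([0, pi)),
    // SIGNED keeps the full circle ([0, 2*pi)).
    const float range = unsigned_phase ? kPi : 2.0f * kPi;
    if (angle < 0.0f)
    {
        angle += range;
    }

    return MagPhase{static_cast<int16_t>(mag), static_cast<uint8_t>(angle / range * 255.0f)};
}

int main()
{
    const MagPhase mp = mag_phase(3, 4, true);
    std::cout << mp.magnitude << " " << int(mp.phase) << std::endl; // magnitude 5
    return 0;
}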
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp deleted file mode 100644 index 248f7307e6..0000000000 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/Scheduler.h" - -using namespace arm_compute; - -CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _gradient_kernel(), - _orient_bin_kernel(), - _block_norm_kernel(), - _hog_detect_kernel(), - _non_maxima_kernel(), - _hog_space(), - _hog_norm_space(), - _detection_windows(), - _mag(), - _phase(), - _non_maxima_suppression(false), - _num_orient_bin_kernel(0), - _num_block_norm_kernel(0), - _num_hog_detect_kernel(0) -{ -} - -void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression, - min_distance); -} - -void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, - ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); - ARM_COMPUTE_ERROR_ON(nullptr == detection_windows); - ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models()); - - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = 
input->info()->dimension(Window::DimY); - const TensorShape &shape_img = input->info()->tensor_shape(); - const size_t num_models = multi_hog->num_models(); - PhaseType phase_type = multi_hog->model(0)->info()->phase_type(); - - size_t prev_num_bins = multi_hog->model(0)->info()->num_bins(); - Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size(); - Size2D prev_block_size = multi_hog->model(0)->info()->block_size(); - Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride(); - - /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object - * - * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * - * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel - * with "input_orient_bin", "input_hog_detect" and "input_block_norm" - */ - std::vector<size_t> input_orient_bin; - std::vector<size_t> input_hog_detect; - std::vector<std::pair<size_t, size_t>> input_block_norm; - - input_orient_bin.push_back(0); - input_hog_detect.push_back(0); - input_block_norm.emplace_back(0, 0); - - for(size_t i = 1; i < num_models; ++i) - { - size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); - Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); - Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); - Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); - - if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) - { - prev_num_bins = cur_num_bins; - prev_cell_size = cur_cell_size; - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute orientation binning and block normalization kernels. Update input to process - input_orient_bin.push_back(i); - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) - || (cur_block_stride.height != prev_block_stride.height)) - { - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute block normalization kernel. 
Update input to process - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - - // Update input to process for hog detector kernel - input_hog_detect.push_back(input_block_norm.size() - 1); - } - - _detection_windows = detection_windows; - _non_maxima_suppression = non_maxima_suppression; - _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute - _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute - _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute - - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); - _hog_detect_kernel.resize(_num_hog_detect_kernel); - _hog_space.resize(_num_orient_bin_kernel); - _hog_norm_space.resize(_num_block_norm_kernel); - - // Allocate tensors for magnitude and phase - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value); - - // Configure NETensor for the HOG space and orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - const size_t idx_multi_hog = input_orient_bin[i]; - - // Get the corresponding cell size and number of bins - const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size(); - const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell.width; - const size_t num_cells_y = height / cell.height; - - // TensorShape of hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate HOG space - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space[i].allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space[i]); - - // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - // Configure CLTensor for the normalized HOG space and block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - const size_t idx_multi_hog = input_block_norm[i].first; - const size_t idx_orient_bin = input_block_norm[i].second; - - // Allocate normalized HOG space - TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); - _hog_norm_space[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_hog_norm_space[i]); - - // Initialize block normalization kernel - _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - - detection_window_strides->map(CLScheduler::get().queue(), true); - - // 
Configure HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - const size_t idx_block_norm = input_hog_detect[i]; - - _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); - } - - detection_window_strides->unmap(CLScheduler::get().queue()); - - // Configure non maxima suppression kernel - _non_maxima_kernel.configure(_detection_windows, min_distance); - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - _hog_norm_space[i].allocator()->allocate(); - } -} - -void CLHOGMultiDetection::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reset detection window - _detection_windows->clear(); - - // Run gradient - _gradient_kernel.run(); - - // Run orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - CLScheduler::get().enqueue(_orient_bin_kernel[i], false); - } - - // Run block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - CLScheduler::get().enqueue(_block_norm_kernel[i], false); - } - - // Run HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - _hog_detect_kernel[i].run(); - } - - // Run non-maxima suppression kernel if enabled - if(_non_maxima_suppression) - { - // Map detection windows array before computing non maxima suppression - _detection_windows->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY); - _detection_windows->unmap(CLScheduler::get().queue()); - } -} diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp deleted file mode 100644 index aecec0d3c5..0000000000 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "arm_compute/runtime/Scheduler.h" -#include "support/MemorySupport.h" - -#include <cmath> -#include <utility> - -using namespace arm_compute; - -CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(nullptr), - _harris_score(), - _non_max_suppr(), - _candidates(), - _sort_euclidean(), - _border_gx(), - _border_gy(), - _gx(), - _gy(), - _score(), - _nonmax(), - _corners_list(), - _num_corner_candidates(0), - _corners(nullptr) -{ -} - -void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16); -} - -void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - - _corners = corners; - - const TensorShape shape = input->info()->tensor_shape(); - const DataType dt = (gradient_size < 7) ? 
DataType::S16 : DataType::S32; - TensorInfo tensor_info(shape, 1, dt); - - _gx.allocator()->init(tensor_info); - _gy.allocator()->init(tensor_info); - - TensorInfo info_f32(shape, 1, DataType::F32); - _score.allocator()->init(info_f32); - _nonmax.allocator()->init(info_f32); - - _corners_list.resize(shape.x() * shape.y()); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - /* Set/init Sobel kernel accordingly with gradient_size */ - switch(gradient_size) - { - case 3: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 5: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 7: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Gradient size not implemented"); - } - - // Normalization factor - const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size); - const float pow4_normalization_factor = pow(norm_factor, 4); - - // Manage intermediate buffers - _memory_group.manage(&_score); - - // Set/init Harris Score kernel accordingly with block_size - _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); - - // Configure border filling using harris score kernel's block size - _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Init non-maxima suppression function - _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode); - - // Allocate intermediate buffers - _score.allocator()->allocate(); - - // Init corner candidates kernel - _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates); - - // Allocate intermediate buffers - _nonmax.allocator()->allocate(); - - // Init euclidean distance - _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist); -} - -void CLHarrisCorners::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Init to 0 number of corner candidates - _num_corner_candidates = 0; - - // Run Sobel kernel - _sobel->run(); - - // Fill border before harris score kernel - CLScheduler::get().enqueue(_border_gx, false); - CLScheduler::get().enqueue(_border_gy, false); - - // Run harris score kernel - CLScheduler::get().enqueue(_harris_score, false); - - // Run non-maxima suppression - _non_max_suppr.run(); - - // Run corner candidate kernel - _nonmax.map(true); - Scheduler::get().schedule(&_candidates, Window::DimY); - _nonmax.unmap(); - - _corners->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_sort_euclidean, Window::DimY); - _corners->unmap(CLScheduler::get().queue()); -} diff --git 
a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp deleted file mode 100644 index e723024334..0000000000 --- a/src/runtime/CL/functions/CLHistogram.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHistogram.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHistogram::CLHistogram() - : _kernel(), _kernel_border() -{ -} - -void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) -{ - _kernel.configure(compile_context, input, output); - _kernel_border.configure(compile_context, input, output); -} - -void CLHistogram::run() -{ - CLScheduler::get().enqueue(_kernel, false); - CLScheduler::get().enqueue(_kernel_border); -} diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp new file mode 100644 index 0000000000..1a2369c5c2 --- /dev/null +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" + +namespace arm_compute +{ +struct CLIndirectConvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr}; +}; + +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; + +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClIndirectConv2d>(); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); +} + +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 273a873c81..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,29 +23,68 @@ */ #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" namespace arm_compute { -CLInstanceNormalizationLayer::CLInstanceNormalizationLayer() +CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) +{ +} +CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>(); - k->configure(compile_context, input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); - _kernel = std::move(k); + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); + auto w = std::make_unique<CLComputeMeanVariance>(); + w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); + _mean_var_kernel = std::move(w); + auto k = std::make_unique<CLInstanceNormalizationLayerKernel>(); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + _inst_norm_kernel = std::move(k); + _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } -} // namespace arm_compute
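The rework above splits instance normalization into a CLComputeMeanVariance pass that fills _mean_var_tensor and a kernel that applies gamma and beta. A scalar C++ reference of the same two-pass arithmetic over a single spatial plane (a sketch of the math only, not the kernels' actual tiling or mixed-precision paths):

#include <cmath>
#include <iostream>
#include <vector>

// Pass 1 computes the per-instance statistics (the job of CLComputeMeanVariance);
// pass 2 normalizes and applies the affine parameters (the job of
// CLInstanceNormalizationLayerKernel). Epsilon guards the reciprocal square
// root when the plane is constant.
void instance_norm_plane(std::vector<float> &x, float gamma, float beta, float epsilon)
{
    float sum = 0.0f, sum_sq = 0.0f;
    for (float v : x)
    {
        sum += v;
        sum_sq += v * v;
    }
    const float mean = sum / x.size();
    const float var  = sum_sq / x.size() - mean * mean;

    const float inv_stddev = 1.0f / std::sqrt(var + epsilon);
    for (float &v : x)
    {
        v = gamma * (v - mean) * inv_stddev + beta;
    }
}

int main()
{
    std::vector<float> plane{1.0f, 2.0f, 3.0f, 4.0f};
    instance_norm_plane(plane, 1.0f, 0.0f, 1e-12f);
    for (float v : plane)
    {
        std::cout << v << " "; // roughly -1.34 -0.45 0.45 1.34
    }
    std::cout << std::endl;
    return 0;
}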
\ No newline at end of file + +void CLInstanceNormalizationLayer::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); + schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); + schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp deleted file mode 100644 index b3be2f8c2c..0000000000 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLIntegralImage.h" - -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLIntegralImage::CLIntegralImage() - : _integral_hor(), _integral_vert() -{ -} - -void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - _integral_hor.configure(compile_context, input, output); - _integral_vert.configure(compile_context, output); -} - -void CLIntegralImage::run() -{ - CLScheduler::get().enqueue(_integral_hor, false); - CLScheduler::get().enqueue(_integral_vert); -} diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 14c83cd543..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,13 +24,17 @@ #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" + namespace arm_compute { namespace @@ -39,17 +43,25 @@ constexpr int max_input_tensor_dim = 3; } // namespace CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() + : _memory_group(std::move(memory_manager)), + _reduce_func(), + _normalize_kernel(std::make_unique<CLL2NormalizeLayerKernel>()), + _sumsq() { } +CLL2NormalizeLayer::~CLL2NormalizeLayer() = default; + void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Reset auxiliary tensor _sumsq.allocator()->init(TensorInfo()); @@ -59,7 +71,7 @@ void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLT // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); + _normalize_kernel->configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor _sumsq.allocator()->allocate(); @@ -75,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); @@ -91,6 +104,6 @@ void CLL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); + CLScheduler::get().enqueue(*_normalize_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 56f22e2fe0..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,61 +24,172 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" + namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), - _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), - _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), - _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), - _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), - _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), - _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + 
_pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, const ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +CLLSTMLayer::~CLLSTMLayer() = default; + +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, 
cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, const ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); + _is_layer_norm_lstm = lstm_params.use_layer_norm(); // Set lstm parameters @@ -86,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), 
cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -110,32 +220,37 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_gate_out2); - _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2); + _concat_inputs_forget_gate.configure(compile_context, inputs_vector, &_forget_gate_out2, Window::DimX); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); + _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -144,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -168,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * 
(input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -185,23 +304,29 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -211,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -239,43 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); + _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, 
&_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -287,26 +426,29 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2); + _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -318,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -350,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); 
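
For orientation, the gate blocks wired up in configure() above implement the standard LSTM cell that the formula comments in these hunks describe. A compact summary in conventional notation (sigma is the logistic function, g the user-supplied activation, and the circled dot an element-wise product; the symbols below are generic, not identifiers from this file — the peephole terms p only apply when has_peephole_opt() is set, and layer normalization is applied before each bias addition when use_layer_norm() is set):

    \begin{aligned}
    f_t &= \sigma(W_f\,[x_t, h_{t-1}] + p_f \odot c_{t-1} + b_f) \\
    i_t &= \begin{cases} 1 - f_t & \text{with CIFG} \\ \sigma(W_i\,[x_t, h_{t-1}] + p_i \odot c_{t-1} + b_i) & \text{otherwise} \end{cases} \\
    c_t &= \operatorname{clip}\bigl(f_t \odot c_{t-1} + i_t \odot g(W_c\,[x_t, h_{t-1}] + b_c),\ \pm\,\text{cell\_threshold}\bigr) \\
    o_t &= \sigma(W_o\,[x_t, h_{t-1}] + p_o \odot c_t + b_o) \\
    h_t &= o_t \odot g(c_t)
    \end{aligned}

The projection and projection clip configured just below apply to h_t when has_projection() is set.
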
     _cell_state_activation.allocator()->allocate();
 
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+                                                lstm_params.projection_bias(), output_state_out);
         _output_state1.allocator()->allocate();
         // Perform clipping
-        if(projection_threshold != 0.f)
+        if (projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -projection_threshold, projection_threshold));
         }
     }
 
@@ -371,8 +521,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     _copy_output.configure(compile_context, output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
-    std::vector<ICLTensor *> scratch_inputs;
-    if(!lstm_params.has_cifg_opt())
+    std::vector<const ICLTensor *> scratch_inputs;
+    if (!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
     }
@@ -386,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     output_gate_out->allocator()->allocate();
 }
 
-Status CLLSTMLayer::validate(const ITensorInfo *input,
-                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+                             const ITensorInfo *input_to_forget_weights,
+                             const ITensorInfo *input_to_cell_weights,
+                             const ITensorInfo *input_to_output_weights,
+                             const ITensorInfo *recurrent_to_forget_weights,
+                             const ITensorInfo *recurrent_to_cell_weights,
+                             const ITensorInfo *recurrent_to_output_weights,
+                             const ITensorInfo *forget_gate_bias,
+                             const ITensorInfo *cell_bias,
+                             const ITensorInfo *output_gate_bias,
+                             const ITensorInfo *output_state_in,
+                             const ITensorInfo *cell_state_in,
+                             const ITensorInfo *scratch_buffer,
+                             const ITensorInfo *output_state_out,
+                             const ITensorInfo *cell_state_out,
+                             const ITensorInfo *output,
+                             const LSTMParams<ITensorInfo> &lstm_params,
+                             const ActivationLayerInfo     &activation_info,
+                             float                          cell_threshold,
+                             float                          projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
-                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias,
-                                        output_state_in, cell_state_in,
-                                        scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
-                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                                       forget_gate_bias, cell_bias, output_gate_bias,
-                                                       output_state_in, cell_state_in,
-                                                       scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check dimensions
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -427,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
-                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+                                cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
 
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         // If CIFG is used, input layer normalization weights tensor is omitted
-        if(lstm_params.has_cifg_opt())
+        if (lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
         }
@@ -448,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+                                            lstm_params.cell_layer_norm_weights(),
+                                            lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+                                                           lstm_params.cell_layer_norm_weights(),
+                                                           lstm_params.output_layer_norm_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -459,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     }
 
     // Check peephole optimization
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -477,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
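
A note on this branch: the input-gate tensors validated below exist only when the coupled input-forget gate (CIFG) optimization is disabled. With CIFG enabled the input gate carries no weights of its own and is derived from the forget gate as

    i_t = 1 - f_t

which is why configure() above only fills a tensor of ones (_ones_fill) and subtracts (_subtract_input_gate), and why the else path further down validates nothing more than a CLArithmeticSubtraction.
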
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -514,98 +683,131 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat)); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? 
nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); - } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + } + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat)); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = 
TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, 
lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } // Validate copy kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&cell_state_tmp, cell_state_out)); - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(&cell_state_tmp, cell_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(output_state_out, output)); // Validate scratch concatenation - std::vector<ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensorInfo *> inputs_vector_info_raw; + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -623,110 +825,113 @@ void CLLSTMLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_concat_inputs_forget_gate); + _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_forget_gate); + _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff); - CLScheduler::get().enqueue(_accum_forget_gate_bias); + _pixelwise_mul_forget_gate_coeff.run(); + _accum_forget_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_forget_gate); + _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - CLScheduler::get().enqueue(_ones_memset_kernel); - CLScheduler::get().enqueue(_subtract_input_gate); + _ones_fill.run(); + _subtract_input_gate.run(); } else { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_input_gate); + _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff); - CLScheduler::get().enqueue(_accum_input_gate_bias); + _pixelwise_mul_input_gate_coeff.run(); + _accum_input_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_input_gate); + _activation_input_gate.run(); } _fully_connected_cell_state.run(); - CLScheduler::get().enqueue(_transpose_cell_state); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); + pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); - CLScheduler::get().enqueue(_accum_cell_state1); - if(_is_layer_norm_lstm) + _accum_cell_state1.run(); + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff); - CLScheduler::get().enqueue(_accum_cell_gate_bias); + _pixelwise_mul_cell_gate_coeff.run(); + _accum_cell_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_cell_state); - CLScheduler::get().enqueue(_pixelwise_mul_cell_state1); - 
CLScheduler::get().enqueue(_pixelwise_mul_cell_state2); - CLScheduler::get().enqueue(_accum_cell_state2); + _activation_cell_state.run(); + _pixelwise_mul_cell_state1.run(); + _pixelwise_mul_cell_state2.run(); + _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { - CLScheduler::get().enqueue(_cell_clip); + _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_output_state1); + _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff); - CLScheduler::get().enqueue(_accum_output_gate_bias); + _pixelwise_mul_output_gate_coeff.run(); + _accum_output_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_output); + _activation_output.run(); - CLScheduler::get().enqueue(_activation_output_state); - CLScheduler::get().enqueue(_pixelwise_mul_output_state2); + _activation_output_state.run(); + _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { - CLScheduler::get().enqueue(_projection_clip); + _projection_clip.run(); } } - CLScheduler::get().enqueue(_copy_cell_state); - CLScheduler::get().enqueue(_copy_output); + _copy_cell_state.run(); + _copy_output.run(); _concat_scratch_buffer.run(); } void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - CLScheduler::get().enqueue(_concat_weights_forget_gate); - if(!_run_cifg_opt) + _concat_weights_forget_gate.run(); + if (!_run_cifg_opt) { - CLScheduler::get().enqueue(_concat_weights_input_gate); + _concat_weights_input_gate.run(); } - CLScheduler::get().enqueue(_concat_weights_output); + _concat_weights_output.run(); _is_prepared = true; } } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index c57fcc9f21..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
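
The CLLSTMLayer::run() and prepare() hunks above all apply one dispatch change; a minimal sketch of the before/after pattern, with illustrative names (_act, _act_kernel, _transpose_kernel, src, dst are placeholders, not identifiers from this commit):

    // Before: every configured kernel was handed to the scheduler directly.
    CLScheduler::get().enqueue(_act_kernel);

    // After: the runtime function owns its kernel(s) and dispatches itself.
    _act.run();

    // Operators that no longer hold tensor state receive their tensors per
    // call via an ITensorPack, as done for the cell-state transpose above:
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, src); // src, dst: ICLTensor*
    pack.add_tensor(TensorType::ACL_DST, dst);
    CLScheduler::get().enqueue_op(*_transpose_kernel, pack, false); // false: no immediate flush
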
* * SPDX-License-Identifier: MIT * @@ -25,12 +25,14 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" -#include <cmath> #include <memory> -#include <tuple> namespace arm_compute { @@ -44,44 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + 
_input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + _is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const 
ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -89,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -118,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); 
recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -138,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -153,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -163,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -174,90 +269,122 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + 
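// Sketch of the fixed-point requantization selected here (gemmlowp convention;
// not verbatim kernel code): the S32 accumulator is scaled as
//   out ~= acc * real_multiplier,
//   real_multiplier = (input_scale * weights_scale) / output_scale,
// with output_scale = 2^-12 as noted in the comment above, and
// calculate_quantized_multiplier() has already decomposed that value so that
//   real_multiplier ~= gemmlowp_multiplier * 2^(-gemmlowp_shift).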
info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + 
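// The multiplication configured below, together with the forget-gate product
// above and the saturating addition that follows, implements the LSTM cell
// update entirely in QSYMM16:
//   cell_state_out = forget_gate * cell_state_in + input_gate * input_modulation_gate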
_mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -266,15 +393,29 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo 
*recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -286,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, 
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -330,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights 
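// The gate-wise input and recurrent matrices, already stacked along y above,
// are joined along x into a single
// (output_size + input_size) x (4 * output_size) matrix; with the input and
// the previous output state concatenated the same way, one GEMMLowp per step
// covers all four gates for both the input and the recurrent contributions.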
std::vector<const ITensorInfo *> weights_vector; @@ -340,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -366,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -377,78 +542,107 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, 
&input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 
1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -457,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { 
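// Mirroring the cell_state_out block above: these checks run only when the
// caller passed a fully initialized TensorInfo (total_size() != 0), in which
// case it must match the QASYMM8 output_state_info derived at the top of
// validate().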
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -523,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp deleted file mode 100644 index 831f0cdcdf..0000000000 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" - -using namespace arm_compute; - -CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT - : _num_levels(0), - _gaussian_pyr_function(), - _convf(), - _subf(), - _depth_function(), - _gauss_pyr(), - _conv_pyr() -{ -} - -void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value); -} - -void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - _num_levels = pyramid->info()->num_levels(); - - // Create and initialize the gaussian pyramid and the convoluted pyramid - PyramidInfo pyramid_info; - pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); - - _gauss_pyr.init(pyramid_info); - _conv_pyr.init(pyramid_info); - - // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value); - - _convf.resize(_num_levels); - _subf.resize(_num_levels); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); - } - - _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); - - _gauss_pyr.allocate(); - _conv_pyr.allocate(); -} - -void CLLaplacianPyramid::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); - - _gaussian_pyr_function.run(); // compute gaussian pyramid - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].run(); // convolute gaussian pyramid - } - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _subf[i].run(); // compute laplacian image - } - - _depth_function.run(); -} diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp deleted file mode 100644 index 
ea6a3f9a98..0000000000 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include <cstddef> - -using namespace arm_compute; - -CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT - : _tmp_pyr(), - _addf(), - _scalef(), - _depthf() -{ -} - -void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value); -} - -void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - const size_t num_levels = pyramid->info()->num_levels(); - - // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) - PyramidInfo pyramid_info; - pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); - _tmp_pyr.init(pyramid_info); - - // Allocate add and scale functions. 
Level 0 does not need to be scaled. - _addf.resize(num_levels); - _scalef.resize(num_levels - 1); - - const size_t last_level = num_levels - 1; - - _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); - - // Scale levels n-1 to 1, and add levels n-2 to 0 - for(size_t l = 0; l < last_level; ++l) - { - _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); - } - - // Convert level 0 from S16 to U8 - _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); - - _tmp_pyr.allocate(); -} - -void CLLaplacianReconstruct::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function"); - - const size_t last_level = _tmp_pyr.info()->num_levels() - 1; - - _addf[last_level].run(); - - // Run l = [last_level - 1, 0] - for(size_t l = last_level; l-- > 0;) - { - _scalef[l].run(); - _addf[l].run(); - } - - _depthf.run(); -} diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp deleted file mode 100644 index 950be5030f..0000000000 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
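The two deletions above retire the pyramid pair: CLLaplacianPyramid stored each level as the Gaussian level minus its Gaussian5x5-filtered copy, and CLLaplacianReconstruct re-added the levels from the coarsest one up to level 0. Its reverse loop relies on the "l-- > 0" idiom to walk an unsigned index down to zero; a standalone sketch, with scale_up and add_level as hypothetical stand-ins for the per-level CLScale and CLArithmeticAddition runs:

    #include <cstddef>
    #include <functional>

    // Hypothetical stand-ins for the per-level kernel launches.
    void reconstruct(size_t num_levels,
                     const std::function<void(size_t)> &scale_up,  // tmp[l] = upsample(tmp[l + 1])
                     const std::function<void(size_t)> &add_level) // tmp[l] += laplace[l]
    {
        const size_t last_level = num_levels - 1;
        // "l-- > 0" tests before decrementing, so the body runs for
        // l = last_level - 1 down to 0 and never sees a wrapped index.
        for (size_t l = last_level; l-- > 0;)
        {
            scale_up(l);
            add_level(l);
        }
    }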
- */ -#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include <cmath> -#include <tuple> - -using namespace arm_compute; - -namespace -{ -void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm) -{ - ARM_COMPUTE_UNUSED(output); - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - bool has_bias = (biases != nullptr); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), - input->dimension(1), - kernel_width, - kernel_height, - conv_info); - - const size_t mat_weights_cols = weights->dimension(3); - const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0); - const size_t mat_weights_num = weights->dimension(4); - - shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num); - - const size_t mat_input_cols = mat_weights_rows; - const size_t mat_input_rows = conv_w * conv_h; - - shape_im2col = input->tensor_shape(); - if(shape_im2col.num_dimensions() >= 3) - { - shape_im2col.remove_dimension(2); - } - shape_im2col.set(0, mat_input_cols); - shape_im2col.set(1, mat_input_rows); - - shape_gemm = shape_im2col; - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, mat_input_rows); -} -} // namespace - -CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) -{ -} - -Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric()); - - bool has_bias = (biases != nullptr); - - if(has_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2); - } - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm); - - 
TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type()); - TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type()); - TensorInfo gemm_output_info(shape_gemm, 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias)); - ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h))); - - return Status{}; -} - -void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info); -} - -void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info)); - - bool _has_bias = (biases != nullptr); - _original_weights = weights; - _is_prepared = false; - - const unsigned int kernel_width = weights->info()->dimension(0); - const unsigned int kernel_height = weights->info()->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height, - conv_info); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input->info(), weights->info(), biases == nullptr ? 
nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm); - - _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); - _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); - - // Manage intermediate buffers - _memory_group.manage(&_input_im2col_reshaped); - _memory_group.manage(&_gemm_output); - - // Configure kernels - _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); - _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); - - // Allocate intermediate tensors - _input_im2col_reshaped.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - CLScheduler::get().tune_kernel_static(_input_im2col_kernel); -} - -void CLLocallyConnectedLayer::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input reshaping - CLScheduler::get().enqueue(_input_im2col_kernel); - - // Runs vector matrix multiply on reshaped matrices - CLScheduler::get().enqueue(_mm_kernel); - - // Reshape output matrix - CLScheduler::get().enqueue(_output_col2im_kernel, false); -} - -void CLLocallyConnectedLayer::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_reshape_kernel); - _original_weights->mark_as_unused(); - - CLScheduler::get().queue().finish(); - _is_prepared = true; - } -} diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp new file mode 100644 index 0000000000..ea21c54bc3 --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
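The locally-connected deletion above lowered the convolution to im2col plus a per-output-position matrix multiply, with calculate_shapes() deriving every intermediate buffer from the kernel and input geometry. The same arithmetic as a standalone sketch, assuming symmetric padding and floor rounding (an illustration, not the library's scaled_dimensions()):

    #include <cstddef>

    struct Im2ColShape
    {
        size_t cols; // one column per kernel element, +1 when a bias is fused in
        size_t rows; // one row per output position
    };

    Im2ColShape im2col_shape(size_t in_w, size_t in_h, size_t in_c,
                             size_t k_w, size_t k_h,
                             size_t pad, size_t stride, bool has_bias)
    {
        const size_t conv_w = (in_w + 2 * pad - k_w) / stride + 1;
        const size_t conv_h = (in_h + 2 * pad - k_h) / stride + 1;
        return {k_w * k_h * in_c + (has_bias ? 1u : 0u), conv_w * conv_h};
    }

    // e.g. im2col_shape(32, 32, 3, 5, 5, 2, 1, true) yields {76, 1024}:
    // 5*5*3 + 1 = 76 columns and 32*32 = 1024 output positions.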
+ */ +#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +#include <utility> + +namespace arm_compute +{ +namespace experimental +{ +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); + k->configure(compile_context, LogicalOperation::And, input1, input2, output); + _kernel = std::move(k); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::And, input1, input2, output); +} + +void CLLogicalAnd::run(ITensorPack &tensors) +{ + ICLOperator::run(tensors); +} +} // namespace experimental + +struct CLLogicalAnd::Impl +{ + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalAnd> op{nullptr}; +}; + +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; +CLLogicalAnd::~CLLogicalAnd() = default; + +void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) +{ + _impl->src0 = input1; + _impl->src1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLLogicalAnd>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info()); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return experimental::CLLogicalAnd::validate(input1, input2, output); +} + +void CLLogicalAnd::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp new file mode 100644 index 0000000000..71f9cce54f --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLLogicalNot.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClLogicalNot.h" + +namespace arm_compute +{ +struct CLLogicalNot::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLogicalNot> op{nullptr}; +}; + +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; +CLLogicalNot::~CLLogicalNot() = default; + +void CLLogicalNot::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLLogicalNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClLogicalNot>(); + _impl->op->configure(compile_context, input->info(), output->info()); +} + +Status CLLogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return opencl::ClLogicalNot::validate(input, output); +} + +void CLLogicalNot::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp new file mode 100644 index 0000000000..3db4fdae84 --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
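CLLogicalAnd and CLLogicalNot above (and CLLogicalOr below) all follow the same split: an operator configured purely on ITensorInfo objects, wrapped by a public function whose Impl captures the ICLTensors and rebinds them through an ITensorPack on every run(). Keeping tensor memory out of the operator is what lets the same kernels serve both the function API and the operator API. A minimal usage sketch, assuming a device with a working OpenCL runtime (the logical functions take U8 tensors):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create context, queue and kernel library

        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8);
        CLTensor a, b, out;
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLLogicalAnd land;
        land.configure(&a, &b, &out); // picks up the default compile context

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        land.run();                // packs src0/src1/dst and runs the operator
        CLScheduler::get().sync(); // wait for the queue to drain
        return 0;
    }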
+ */ +#include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +#include <utility> + +namespace arm_compute +{ +namespace experimental +{ +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); + k->configure(compile_context, LogicalOperation::Or, input1, input2, output); + _kernel = std::move(k); +} + +Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::Or, input1, input2, output); +} + +void CLLogicalOr::run(ITensorPack &tensors) +{ + ICLOperator::run(tensors); +} +} // namespace experimental + +struct CLLogicalOr::Impl +{ + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalOr> op{nullptr}; +}; + +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; +CLLogicalOr::~CLLogicalOr() = default; + +void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) +{ + _impl->src0 = input1; + _impl->src1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLLogicalOr>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info()); +} + +Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return experimental::CLLogicalOr::validate(input1, input2, output); +} + +void CLLogicalOr::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp deleted file mode 100644 index a267952d4a..0000000000 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMagnitude.h" - -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type); -} - -void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>(); - k->configure(compile_context, input1, input2, output, nullptr, mag_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp new file mode 100644 index 0000000000..e8bdad706b --- /dev/null +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
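The removed CLMagnitude was a one-kernel function: it instantiated CLMagnitudePhaseKernel with the phase output disabled (the nullptr argument). As a scalar model of the per-pixel computation (an illustration assuming saturation to the S16 output format, not the OpenCL source):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // MagnitudeType::L1NORM -> |gx| + |gy|; MagnitudeType::L2NORM -> sqrt(gx^2 + gy^2).
    int16_t magnitude(int16_t gx, int16_t gy, bool l2norm)
    {
        const float fx = static_cast<float>(gx);
        const float fy = static_cast<float>(gy);
        const float m  = l2norm ? std::sqrt(fx * fx + fy * fy) : std::fabs(fx) + std::fabs(fy);
        return static_cast<int16_t>(std::min(m, 32767.0f)); // saturate to S16
    }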
+ */ +#include "arm_compute/runtime/CL/functions/CLMatMul.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/operators/ClMatMul.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClMatMul; + +struct CLMatMul::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; +CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +CLMatMul::~CLMatMul() = default; + +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(settings); + configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); +} + +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(settings); + + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; +} + +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); +} + +void CLMatMul::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp new file mode 100644 index 0000000000..7494f379b9 --- /dev/null +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
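CLMatMul above is the new batched matrix-multiply entry point: configure() hands the tensor infos to the opencl::ClMatMul operator and keeps an ITensorPack that run() replays. A usage sketch, assuming the header defaults the trailing activation argument, and using square 64x64 operands to sidestep the (x, y) = (columns, rows) shape convention:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/CLTypes.h"
    #include "arm_compute/runtime/CL/functions/CLMatMul.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        const TensorInfo info(TensorShape(64U, 64U), 1, DataType::F32);
        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(info);
        rhs.allocator()->init(info);
        dst.allocator()->init(info);

        CLMatMul mm;
        mm.configure(&lhs, &rhs, &dst, MatMulInfo(), GpuMatMulSettings());

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();

        mm.run();
        CLScheduler::get().sync();
        return 0;
    }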
+ */ +#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" + +namespace arm_compute +{ +CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() + : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) +{ +} + +CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; + +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); +} + +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); + const PixelValue zero_value(0.f); + _fill.configure(output, zero_value); + + _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); +} + +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) +{ + return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); +} + +void CLMaxUnpoolingLayer::run() +{ + // Run fill + _fill.run(); + + // Run max unpooling layer + CLScheduler::get().enqueue(*_unpooling_layer_kernel); +} +} /* namespace arm_compute */ diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp deleted file mode 100644 index e3ce704bfb..0000000000 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
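CLMaxUnpoolingLayer above is a two-stage function: a CLFill zeroes the destination, then the kernel scatters every pooled value back to the position recorded by the matching pooling layer's indices output. A scalar model of the data movement, assuming the indices hold flat offsets into the destination:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Stage 1: zero-fill (every non-max location stays 0).
    // Stage 2: scatter each pooled value to where its max came from.
    void max_unpool(const float *src, const uint32_t *indices, size_t n_src,
                    float *dst, size_t n_dst)
    {
        std::fill(dst, dst + n_dst, 0.0f);
        for (size_t i = 0; i < n_src; ++i)
        {
            dst[indices[i]] = src[i];
        }
    }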
- */ -#include "arm_compute/core/TensorInfo.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" - -using namespace arm_compute; - -CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _data_type(), - _num_pixels(), - _run_stddev(), - _reduction_operation_mean(), - _reduction_operation_stddev(), - _reduction_output_mean(), - _reduction_output_stddev(), - _mean(nullptr), - _stddev(nullptr), - _mean_stddev_kernel(), - _fill_border_kernel(), - _global_sum(), - _global_sum_squared() -{ -} - -Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) -{ - ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); - if(is_data_type_float(input->data_type())) - { - ARM_COMPUTE_UNUSED(mean); - ARM_COMPUTE_UNUSED(stddev); - - TensorShape output_shape = TensorShape{ 1, input->dimension(1) }; - TensorInfo output_shape_info = TensorInfo(output_shape, 1, DataType::U8); - return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM); - } - else - { - return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr); - } -} - -void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev); -} - -void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev) -{ - // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev - _data_type = input->info()->data_type(); - - if(is_data_type_float(_data_type)) - { - _num_pixels = input->info()->dimension(0) * input->info()->dimension(1); - - _memory_group.manage(&_reduction_output_mean); - _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM); - _reduction_output_mean.allocator()->allocate(); - _mean = mean; - - if(stddev != nullptr) - { - _memory_group.manage(&_reduction_output_stddev); - _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE); - _reduction_output_stddev.allocator()->allocate(); - _stddev = stddev; - _run_stddev = true; - } - } - else - { - _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - - if(stddev != nullptr) - { - _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - } - - _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0))); - } -} - -template <typename T> -void CLMeanStdDev::run_float() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Perform reduction on x-axis - _reduction_operation_mean.run(); - if(_run_stddev) - { - _reduction_operation_stddev.run(); - _reduction_output_stddev.map(true); - } - - _reduction_output_mean.map(true); - - auto mean = static_cast<T>(0); - - // Calculate final result for mean - for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i) - { - mean += *reinterpret_cast<T *>(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i))); - } - - mean /= _num_pixels; - *_mean = mean; - - if(_run_stddev) - { - auto stddev = 
static_cast<T>(0); - // Calculate final result for stddev - for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i) - { - stddev += *reinterpret_cast<T *>(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i))); - } - *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean)); - - _reduction_output_stddev.unmap(); - } - _reduction_output_mean.unmap(); -} - -void CLMeanStdDev::run_int() -{ - CLScheduler::get().enqueue(_fill_border_kernel); - CLScheduler::get().enqueue(_mean_stddev_kernel); -} - -void CLMeanStdDev::run() -{ - switch(_data_type) - { - case DataType::F16: - run_float<half>(); - break; - case DataType::F32: - run_float<float>(); - break; - case DataType::U8: - run_int(); - break; - default: - ARM_COMPUTE_ERROR_ON("Not supported"); - } -} diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 3dbab76c72..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -34,9 +35,13 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { - auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); + auto k = std::make_unique<CLMeanStdDevNormalizationKernel>(); k->configure(compile_context, input, output, epsilon); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp deleted file mode 100644 index dc53240f79..0000000000 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" - -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp deleted file mode 100644 index 15b28330b5..0000000000 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
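The removed CLMedian3x3 paired CLMedian3x3Kernel with a border handler sized to the kernel's border_size(), then selected the middle of the nine neighbourhood values per pixel. A scalar reference model:

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // Median of a 3x3 window: nth_element is enough, a full sort is not needed.
    uint8_t median3x3(std::array<uint8_t, 9> window)
    {
        std::nth_element(window.begin(), window.begin() + 4, window.end());
        return window[4];
    }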
- */ -#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" - -#include "arm_compute/core/CL/CLHelpers.h" - -namespace arm_compute -{ -CLMinMaxLocation::CLMinMaxLocation() - : _min_max_kernel(), - _min_max_loc_kernel(), - _min_max_vals(), - _min_max_count_vals(), - _min(nullptr), - _max(nullptr), - _min_count(nullptr), - _max_count(nullptr), - _min_loc(nullptr), - _max_loc(nullptr) -{ -} - -void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); -} - -void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, - uint32_t *min_count, - uint32_t *max_count) -{ - ARM_COMPUTE_ERROR_ON(nullptr == min); - ARM_COMPUTE_ERROR_ON(nullptr == max); - - _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t)); - _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t)); - _min = min; - _max = max; - _min_count = min_count; - _max_count = max_count; - _min_loc = min_loc; - _max_loc = max_loc; - - _min_max_kernel.configure(compile_context, input, &_min_max_vals); - _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); -} - -void CLMinMaxLocation::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - CLScheduler::get().enqueue(_min_max_kernel, false); - CLScheduler::get().enqueue(_min_max_loc_kernel, false); - - // Update min and max - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min)); - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max)); - - // Update min and max count - if(_min_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count); - } - if(_max_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count); - } - - // Update min/max point arrays (Makes the kernel blocking) - if(_min_loc != nullptr) - { - unsigned int min_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count); - size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values()); - _min_loc->resize(min_corner_size); - } - if(_max_loc != nullptr) - { - unsigned int max_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count); - size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values()); - _max_loc->resize(max_corner_size); - } -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp deleted file mode 100644 index 96912a21cd..0000000000 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" - -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value); -} - -void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, - const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>(); - k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp deleted file mode 100644 index 6d4a28db26..0000000000 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" - -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode); -} - -void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - if(border_mode != BorderMode::UNDEFINED) - { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); - } - else - { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); - } -} diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index f59a4ca959..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,44 +25,66 @@ #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +namespace arm_compute +{ CLNormalizationLayer::CLNormalizationLayer() - : _norm_kernel(), _border_handler() + : _norm_kernel(std::make_unique<CLNormalizationLayerKernel>()), + _border_handler(std::make_unique<CLFillBorderKernel>()) { } +CLNormalizationLayer::~CLNormalizationLayer() = default; + void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); // Configure normalization kernel - _norm_kernel.configure(compile_context, input, output, norm_info); + _norm_kernel->configure(compile_context, input, output, norm_info); - // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + if 
(!_norm_kernel->border_size().empty()) + { + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); + } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - // Run border handler - CLScheduler::get().enqueue(_border_handler, false); + if (!_norm_kernel->border_size().empty()) + { + // Run border handler + CLScheduler::get().enqueue(*_border_handler, false); + } // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); + CLScheduler::get().enqueue(*_norm_kernel); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index b03de6475b..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,37 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include <utility> namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { - auto k = arm_compute::support::cpp14::make_unique<CLNormalizePlanarYUVLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); + auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>(); k->configure(compile_context, input, output, mean, std); _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp deleted file mode 100644 index 5f7c1704ee..0000000000 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _tracker_init_kernel(), - _tracker_stage0_kernel(), - _tracker_stage1_kernel(), - _tracker_finalize_kernel(), - _func_scharr(), - _scharr_gx(), - _scharr_gy(), - _old_points(nullptr), - _new_points_estimates(nullptr), - _new_points(nullptr), - _old_points_internal(), - _new_points_internal(), - _coefficient_table(), - _old_values(), - _num_levels(0) -{ -} - -void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension, - use_initial_estimate, border_mode, constant_border_value); -} - -void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == old_points); - ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); - ARM_COMPUTE_ERROR_ON(nullptr == new_points); - 
ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); - - // Set member variables - _old_points = old_points; - _new_points_estimates = new_points_estimates; - _new_points = new_points; - _num_levels = old_pyramid->info()->num_levels(); - - const float pyr_scale = old_pyramid->info()->scale(); - const int list_length = old_points->num_values(); - const int old_values_list_length = list_length * window_dimension * window_dimension; - - // Create kernels and tensors - _tracker_init_kernel.resize(_num_levels); - _tracker_stage0_kernel.resize(_num_levels); - _tracker_stage1_kernel.resize(_num_levels); - _func_scharr.resize(_num_levels); - _scharr_gx.resize(_num_levels); - _scharr_gy.resize(_num_levels); - - // Create internal keypoint arrays - _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length); - _old_points_internal->resize(list_length); - _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length); - _new_points_internal->resize(list_length); - _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length); - _coefficient_table->resize(list_length); - _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length); - _old_values->resize(old_values_list_length); - _new_points->resize(list_length); - - for(size_t i = 0; i < _num_levels; ++i) - { - // Get images from the ith level of old and right pyramid - ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i); - ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i); - - // Get width and height of images - const unsigned int width_ith = old_ith_input->info()->dimension(0); - const unsigned int height_ith = new_ith_input->info()->dimension(1); - - // Initialize Scharr tensors - TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16); - _scharr_gx[i].allocator()->init(tensor_info); - _scharr_gy[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_scharr_gx[i]); - _memory_group.manage(&_scharr_gy[i]); - - // Init Scharr kernel - _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); - - // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); - - // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); - - // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); - - // Allocate intermediate buffers - _scharr_gx[i].allocator()->allocate(); - _scharr_gy[i].allocator()->allocate(); - } - - // Finalize 
Lucas-Kanade - _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); -} - -void CLOpticalFlow::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int level = _num_levels; level > 0; --level) - { - // Run Scharr kernel - _func_scharr[level - 1].run(); - - // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); - - // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); - - // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); - } - - CLScheduler::get().enqueue(_tracker_finalize_kernel, true); -} diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 6543ab922e..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,45 +21,63 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" -#include "support/MemorySupport.h" + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/operators/ClPRelu.h" namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) +using OperatorType = opencl::ClPRelu; + +struct CLPReluLayer::Impl { - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; +}; - if(broadcasted_info->info()->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>()) +{ } -} // namespace +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; +CLPReluLayer::~CLPReluLayer() = default; void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output); + _impl->src_0 = input; + _impl->src_1 = alpha; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); + return OperatorType::validate(input, alpha, output); +} + +void CLPReluLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 078bdbc51f..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
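
CLPReluLayer above is the first of many functions in this section rewritten around a pImpl struct plus a stateless opencl::Cl* operator: configure() captures the tensors and configures the operator from their ITensorInfo metadata, and run() rebuilds an ITensorPack on every call. A self-contained sketch of that split, with simplified stand-ins (Slot, Pack, SimpleOp) in place of TensorType, ITensorPack and ClPRelu; this is the pattern, not the real API:

#include <cassert>
#include <cstddef>
#include <map>
#include <memory>

enum class Slot
{
    Src0,
    Src1,
    Dst
};
using Pack = std::map<Slot, float *>; // stands in for ITensorPack

// Stateless operator: configured once from metadata, run against a pack.
class SimpleOp
{
public:
    void configure(std::size_t n)
    {
        _n = n;
    }
    void run(Pack &pack)
    {
        const float *a = pack.at(Slot::Src0);
        const float *w = pack.at(Slot::Src1);
        float       *d = pack.at(Slot::Dst);
        for (std::size_t i = 0; i < _n; ++i)
        {
            d[i] = a[i] > 0 ? a[i] : a[i] * w[i]; // PReLU-style rule
        }
    }

private:
    std::size_t _n{0};
};

// Function: owns tensor identity in an Impl, re-packs the tensors on run().
class SimpleFunction
{
public:
    void configure(float *a, float *w, float *d, std::size_t n)
    {
        _impl->src_0 = a;
        _impl->src_1 = w;
        _impl->dst   = d;
        _impl->op    = std::make_unique<SimpleOp>();
        _impl->op->configure(n);
    }
    void run()
    {
        Pack pack{{Slot::Src0, _impl->src_0}, {Slot::Src1, _impl->src_1}, {Slot::Dst, _impl->dst}};
        _impl->op->run(pack);
    }

private:
    struct Impl
    {
        float                    *src_0{nullptr};
        float                    *src_1{nullptr};
        float                    *dst{nullptr};
        std::unique_ptr<SimpleOp> op{nullptr};
    };
    std::unique_ptr<Impl> _impl{std::make_unique<Impl>()};
};

int main()
{
    float a[3] = {-1.f, 2.f, -3.f};
    float w[3] = {0.5f, 0.5f, 0.5f};
    float out[3];
    SimpleFunction prelu;
    prelu.configure(a, w, out, 3);
    prelu.run();
    assert(out[0] == -0.5f && out[1] == 2.f && out[2] == -1.5f);
    return 0;
}

Keeping the operator stateless and re-packing on every run() is what lets the same configured operator be shared across workloads; the function layer only preserves the tensor bindings.
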
* * SPDX-License-Identifier: MIT * @@ -23,65 +23,74 @@ */ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" + namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(), _copy_kernel(), _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false) { } -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +CLPadLayer::~CLPadLayer() = default; + +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { - _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode); + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(compile_context, input, output); + _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } else { - Window copy_window = Window(); - copy_window.use_tensor_dimensions(output->tensor_shape()); - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), ©_window)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(input, output)); } return Status{}; } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { - CLScheduler::get().enqueue(_pad_kernel); + CLScheduler::get().enqueue(*_pad_kernel); } else { - CLScheduler::get().enqueue(_copy_kernel); + _copy.run(); } } -} // namespace arm_compute
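
The CLPadLayer rewrite above keeps the same dispatch decision, now routed through a CLCopy function instead of a raw CLCopyKernel: padding that is all zeros degenerates to a plain copy. The test itself is the std::any_of lambda visible in the diff; a standalone version follows, with PaddingList simplified to a vector of (before, after) pairs, a stand-in for the real type in ACL's Types.h:

#include <algorithm>
#include <cstdint>
#include <cassert>
#include <utility>
#include <vector>

using PaddingInfo = std::pair<uint32_t, uint32_t>; // (elements before, elements after)
using PaddingList = std::vector<PaddingInfo>;      // one entry per tensor dimension

bool needs_pad(const PaddingList &padding)
{
    // Any dimension with non-zero before/after padding forces the pad kernel;
    // otherwise the layer degenerates to a copy of input into output.
    return std::any_of(padding.begin(), padding.end(),
                       [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
}

int main()
{
    assert(!needs_pad({{0, 0}, {0, 0}}));
    assert(needs_pad({{0, 1}}));
}
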
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index e6323ce504..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,28 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLPermute.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" -#include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { +struct CLPermute::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClPermute> op{nullptr}; +}; + +CLPermute::CLPermute() : _impl(std::make_unique<Impl>()) +{ +} + +CLPermute::~CLPermute() = default; + void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) { configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>(); - k->configure(compile_context, input, output, perm); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, perm); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClPermute>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), perm); } Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm)); - return Status{}; + return opencl::ClPermute::validate(input, output, perm); +} + +void CLPermute::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp deleted file mode 100644 index b915104f38..0000000000 --- a/src/runtime/CL/functions/CLPhase.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLPhase.h" - -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type); -} - -void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>(); - k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 3c1a7de76d..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,67 +24,132 @@ #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClMul.h" #include <utility> namespace arm_compute { -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +struct CLPixelWiseMultiplication::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClMul> op{nullptr}; +}; + +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; + +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); +} + +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClMul>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>(); - k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } + return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::run() { - return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +struct CLComplexPixelWiseMultiplication::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClComplexMul> op{nullptr}; +}; + +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const 
ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>(); - k->configure(compile_context, input1, input2, output, act_info); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClComplexMul>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output, act_info); + return opencl::ClComplexMul::validate(input1, input2, output, act_info); +} + +void CLComplexPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp new file mode 100644 index 0000000000..ce1092a7cc --- /dev/null +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPool3d.h" + +namespace arm_compute +{ +struct CLPooling3dLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool3d> op{nullptr}; +}; + +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLPooling3dLayer::~CLPooling3dLayer() = default; + +void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); +} + +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClPool3d>(); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info); +} + +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return opencl::ClPool3d::validate(input, output, pool_info); +} + +void CLPooling3dLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index e7735b00df..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,70 +23,64 @@ */ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPool2d.h" namespace arm_compute { -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +struct CLPoolingLayer::Impl { - configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); -} + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool2d> op{nullptr}; +}; -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - // Configure pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, input, output, pool_info, indices); - _kernel = std::move(k); - - const DataType data_type = input->info()->data_type(); +} +CLPoolingLayer::~CLPoolingLayer() = default; - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode{}; - PixelValue pixel_value(0.f); - if(is_data_type_quantized_asymmetric(data_type) && !pool_info.exclude_padding) - { - pixel_value = PixelValue(0, data_type, input->info()->quantization_info()); - } +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); +} - // Data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout; +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) +{ + _impl->src = input; + _impl->dst = output; + _impl->indices = indices; - switch(data_layout) - { - case DataLayout::NCHW: - border_mode = (PoolingType::MAX == pool_info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - break; - case DataLayout::NHWC: - border_mode = BorderMode::CONSTANT; - if(PoolingType::MAX == pool_info.pool_type) - { - if(is_data_type_quantized(data_type)) - { - std::tie(pixel_value, std::ignore) = get_min_max(data_type); - } - else - { - pixel_value = PixelValue(std::numeric_limits<float>::lowest()); - } - } - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); + _impl->op = std::make_unique<opencl::ClPool2d>(); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? 
indices->info() : nullptr); +} - // Tune kernels - CLScheduler::get().tune_kernel_static(*_kernel); +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) +{ + return opencl::ClPool2d::validate(input, output, pool_info, indices); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +void CLPoolingLayer::run() { - return CLPoolingLayerKernel::validate(input, output, pool_info, indices); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + pack.add_tensor(TensorType::ACL_DST_1, _impl->indices); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index d01b4c711b..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,40 +24,56 @@ #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" + using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } - auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>(); + auto k = std::make_unique<CLPriorBoxLayerKernel>(); k->configure(compile_context, input1, input2, output, info, &_min, &_max, &_aspect_ratios); _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); -}
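
The CLQLSTMLayer diff that follows funnels every per-gate rescale factor through quantization::calculate_quantized_multiplier. As a reference for what that helper produces, here is a hedged sketch of the usual gemmlowp-style decomposition of a positive real scale into a Q0.31 fixed-point multiplier and a power-of-two shift; ACL's exact signature and shift-sign convention may differ:

#include <cassert>
#include <cmath>
#include <cstdint>

// Decompose real_multiplier as quantized_multiplier * 2^(shift - 31),
// with quantized_multiplier in [2^30, 2^31).
void quantize_multiplier(double real_multiplier, int32_t &quantized_multiplier, int &shift)
{
    assert(real_multiplier > 0.0);
    const double q       = std::frexp(real_multiplier, &shift); // real = q * 2^shift, q in [0.5, 1)
    auto         q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // q rounded up to exactly 1.0
    {
        q_fixed /= 2;
        ++shift;
    }
    quantized_multiplier = static_cast<int32_t>(q_fixed);
    // Downstream, an int32 accumulator v is then rescaled as roughly:
    //   result = rounding_right_shift(v * quantized_multiplier, 31 - shift)
}

int main()
{
    int32_t m = 0;
    int     s = 0;
    quantize_multiplier(0.00784313771, m, s); // e.g. 2/255, a typical scale product
    const double rebuilt = static_cast<double>(m) * std::ldexp(1.0, s - 31);
    assert(std::abs(rebuilt - 0.00784313771) < 1e-9);
}
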
\ No newline at end of file +} diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 524c7b3aae..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,22 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" + namespace arm_compute { using namespace arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -71,28 +85,71 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + 
_recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _layer_norms(), + _copy_output() { + for (auto &norm : _layer_norms) + { + norm = std::make_unique<CLQLSTMLayerNormalizationKernel>(); + } + _memory_group = MemoryGroup(std::move(memory_manager)); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +CLQLSTMLayer::~CLQLSTMLayer() = default; + +void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + CLTensor *out = &get_layer_norm_output(g); + _memory_group.manage(out); + out->allocator()->init(*(in->info())); + + get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage. + const TensorInfo out{in}; + return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -104,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor 
*recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -135,16 +213,20 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out, output, lstm_params); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), 
output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -165,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -187,45 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, 
&_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) + { + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); + } } // Pre-transpose weights to be used in GEMM. 
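
The block of *_reduction->configure(...) calls above (ending just before the weight transposes) precomputes "effective biases": with an asymmetric quantized input x_q and zero point z, the product (x_q - z) * W expands to x_q * W - z * rowsum(W), so the second term is a constant per output unit that can be folded into the bias once, which is what the -qinput.offset and -qoutput_state_in.offset scale factors in the GEMMLowpReductionKernelInfo arguments express. A plain-C++ sketch of that fold, illustrative only and not the reduction kernel itself:

#include <cassert>
#include <cstdint>
#include <vector>

// weights is row-major [units][depth]; returns one folded bias per unit.
std::vector<int32_t>
effective_bias(const std::vector<int8_t> &weights, int units, int depth, int32_t input_zero_point)
{
    std::vector<int32_t> bias(units, 0);
    for (int u = 0; u < units; ++u)
    {
        int32_t row_sum = 0;
        for (int d = 0; d < depth; ++d)
        {
            row_sum += weights[u * depth + d];
        }
        bias[u] = -input_zero_point * row_sum; // matches the -offset scale passed above
    }
    return bias;
}

int main()
{
    const std::vector<int8_t> w{1, 2, 3, -1, -2, -3}; // 2 units x 3 depth
    const auto                bias = effective_bias(w, 2, 3, /*input_zero_point=*/10);
    assert(bias[0] == -60 && bias[1] == 60);
}

Precomputing this once at configure time is what makes the per-run GEMMs pure x_q * W products; the projection path additionally accumulates the real projection bias into the same tensor, as the _projection_bias_add call above shows.
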
- _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -238,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
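
Each gate configuration below follows the same requantization arithmetic: the int32 GEMM accumulator carries the scale weight_scale * input_scale, and mapping it onto the gate's QSYMM16 intermediate scale therefore requires the ratio of the two, which is exactly the value handed to calculate_quantized_multiplier. Spelled out as a trivial helper (an illustration, not an ACL API):

float gate_rescale(float weight_scale, float input_scale, float intermediate_scale)
{
    // Scale of the (input * weight) accumulator, re-expressed in the gate's
    // intermediate quantization scale, e.g. input_to_forget_scale below.
    return weight_scale * input_scale / intermediate_scale;
}
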
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * 
lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -286,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -319,122 +447,158 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
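The accumulation step repeated for each gate above sums the input-path and recurrent-path outstage results in place with saturating QSYMM16 arithmetic; the second operand doubles as the destination, so no extra tensor is needed. A sketch under the assumption that both tensors are already allocated with identical quantization:

#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

// Illustrative: recurrent_path += input_path, saturating, in place,
// mirroring the _accumulate_input_recurrent_* calls in the diff.
void accumulate_gate(const CLCompileContext &ctx, ICLTensor *input_path, ICLTensor *recurrent_path)
{
    CLArithmeticAddition acc;
    acc.configure(ctx, input_path, recurrent_path, recurrent_path, ConvertPolicy::SATURATE);
    acc.run();
}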
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); - _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); + _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * 
lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
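In the CIFG branch above, the input gate is never computed from its own weights; it is coupled to the forget gate as i_t = 1 - f_t. In the QSYMM16 gate format (scale 1/32768, per sigmoid_tanh_outqinfo) the value 1.0 saturates to 32767, so the subtraction from the _ones tensor stays entirely in the quantized domain. A sketch mirroring _input_gate_sub:

#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

// Illustrative CIFG coupling: input_gate = ones - forget_gate, saturating.
void couple_cifg_gates(const CLCompileContext &ctx, ICLTensor *ones, ICLTensor *forget_gate, ICLTensor *input_gate)
{
    CLArithmeticSubtraction sub;
    sub.configure(ctx, ones, forget_gate, input_gate, ConvertPolicy::SATURATE);
    sub.run();
}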
- // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
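The cell block above is the textbook LSTM update c_t = f_t * c_{t-1} + i_t * g_t, realized as two pixel-wise multiplications and one saturating addition, with an optional symmetric clip expressed as a lower/upper bounded ReLU. A sketch of the clip step, keeping the same argument order as the _cell_clip call above:

#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

using namespace arm_compute;

// Illustrative: clamp the updated cell state in place (nullptr output means
// the activation writes back into its input).
void clip_cell_state(const CLCompileContext &ctx, ICLTensor *cell_state_out, int16_t quantized_cell_clip)
{
    CLActivationLayer clip;
    clip.configure(ctx, cell_state_out, nullptr,
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                       -quantized_cell_clip, quantized_cell_clip));
    clip.run();
}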
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, 
QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -444,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
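In the peephole paths above, the element-wise product lands in S32 and is then pushed through a GEMMLowp output stage with a null bias pointer, which skips the bias addition and only applies the multiplier/shift requantization into the QSYMM16 intermediate. Sketch:

#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

// Illustrative: requantize an S32 peephole product into QSYMM16; nullptr
// bias means the stage performs only the fixed-point rescale, as the diff
// does for every cell_to_* outstage.
void requantize_peephole(const CLCompileContext &ctx, const ICLTensor *mul_res_s32,
                         ICLTensor *outstage_res_q16, const GEMMLowpOutputStageInfo &info)
{
    CLGEMMLowpOutputStage outstage;
    outstage.configure(ctx, mul_res_s32, /* bias */ nullptr, outstage_res_q16, info);
    outstage.run();
}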
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); - // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -466,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -477,60 +645,62 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. 
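The hidden block above computes h = o_t * tanh(c_t). Both operands are QSYMM16 with an implicit 2^-15 scale, so their S32 product carries scale 2^-30, and the requantization factor to the requested hidden-state scale is 2^-30 / hidden_state_scale; that is exactly what the two pow(2, -15) factors express, and ignore_epsilon lets the helper accept the resulting very small multiplier. The relevant lines, repeated with that reading spelled out:

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
// i.e. effective_scale = 2^-30 / hidden_state_scale for the S32 product of two Q15 tensors.
quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
                                             &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();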
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); - _projection_accumulate_res.allocator()->init(*output_state_out->info()); + _projection_accumulate_res.allocator()->init(*output_state_in->info()); _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape()); - _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res); + _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res); accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - 
if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -541,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _copy_output.configure(compile_context, output_state_out, output); } -Status CLQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status CLQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -563,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); 
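The reflowed validate() above takes one ITensorInfo* for every ICLTensor* that configure() takes, so a caller can reject an ill-formed graph before any OpenCL buffers exist. A sketch of the intended call pattern; every tensor name here is a placeholder for objects assumed to be initialised elsewhere:

#include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"

using namespace arm_compute;

// Illustrative, inside some setup function: validate first, configure only
// on success. All tensors (CLTensor) and lstm_params are placeholders.
const Status s = CLQLSTMLayer::validate(input.info(), in2f_w.info(), in2c_w.info(), in2o_w.info(),
                                        rec2f_w.info(), rec2c_w.info(), rec2o_w.info(),
                                        f_bias.info(), c_bias.info(), o_bias.info(),
                                        cell_in.info(), state_in.info(),
                                        cell_out.info(), state_out.info(), out.info(), lstm_params);
ARM_COMPUTE_ERROR_THROW_ON(s);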
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -588,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -615,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -623,27 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, 
true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); + } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_forget_weights->data_type(), + input_to_forget_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); // Validate weights transpose ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed)); @@ -652,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.projection_weights(), 
&projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -673,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + 
DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -705,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, 
recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -726,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = 
lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); 
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
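Each gate's pre-activation can optionally be layer-normalised, and validate() checks that path per gate through the layer's internal validate_layer_norm() helper, pairing the gate's 1-D normalisation weights with its bias. The forget-gate instance of the pattern, reconstructed from the checks earlier in this hunk (the exact helper call is partly elided by the hunk boundary):

if (has_layer_norm)
{
    const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
    const ITensorInfo *b_info = forget_gate_bias;
    ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(forget_outstage_info, *w_info, *b_info));
}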
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 
1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -812,91 +1089,109 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); - gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.output_data_type = hidden_out_info.data_type(); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
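
Throughout these checks the effective requantization scale follows the standard identity: the S32 GEMMLowp accumulator carries scale w_scale * x_scale, so rescaling it to an intermediate scale requires the multiplier (w_scale * x_scale) / intermediate_scale, which is then split into a fixed-point multiplier and a power-of-two shift. A hedged sketch with made-up numbers:

    // Illustrative values only; the real scales come from the tensors' QuantizationInfo.
    const float w_scale = 0.02f, x_scale = 0.004f, intermediate_scale = 0.0005f;
    const float effective_scale = w_scale * x_scale / intermediate_scale;
    int32_t multiplier = 0;
    int32_t shift      = 0;
    // Decomposes effective_scale into multiplier * 2^-shift for the output stage.
    const Status s = quantization::calculate_quantized_multiplier(effective_scale,
                                                                  &multiplier, &shift);

The projection path below reuses the same decomposition with the projection weights' scale.
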
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - 
if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); } - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(output_state_out, output)); return Status{}; } @@ -913,16 +1208,16 @@ void CLQLSTMLayer::run() _mm_recurrent_to_forget.run(); _recurrent_to_forget_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_forget); + _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget); + _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_forget); + _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -935,9 +1230,9 @@ void CLQLSTMLayer::run() _mm_recurrent_to_cell.run(); _recurrent_to_cell_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation); + _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -945,9 +1240,9 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { - CLScheduler::get().enqueue(_input_gate_sub); + _input_gate_sub.run(); } else { @@ -955,16 +1250,16 @@ void CLQLSTMLayer::run() _input_to_input_outstage.run(); _mm_recurrent_to_input.run(); _recurrent_to_input_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_input); + _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input); + _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_input); + _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -973,10 +1268,10 @@ void CLQLSTMLayer::run() } // Cell. 
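
The run() hunks here and below all make the same mechanical change: members that used to be kernels, pushed onto the command queue by the layer itself, are now runtime functions that own their operator and do their own scheduling. Schematically (placeholder names, not actual members):

    // Before: the layer enqueues a kernel directly.
    //     CLScheduler::get().enqueue(_some_kernel);
    // After: the layer calls a function; scheduling and tensor binding happen inside.
    //     _some_function.run();

Only the layer-norm gates still go through CLScheduler::get().enqueue(), because they remain raw kernels. The cell-update statements that follow are the first instance of this rewrite.
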
- CLScheduler::get().enqueue(_pixelwise_mul_forget_cell); - CLScheduler::get().enqueue(_pixelwise_mul_input_cell); - CLScheduler::get().enqueue(_add_forget_cell); - if(_has_cell_clipping) + _pixelwise_mul_forget_cell.run(); + _pixelwise_mul_input_cell.run(); + _add_forget_cell.run(); + if (_has_cell_clipping) { _cell_clip.run(); } @@ -986,15 +1281,15 @@ void CLQLSTMLayer::run() _input_to_output_outstage.run(); _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_output); - if(_has_peephole) + _accumulate_input_recurrent_output.run(); + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output); + _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_to_output); + _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1003,47 +1298,47 @@ void CLQLSTMLayer::run() // Hidden. _hidden_tanh.run(); - CLScheduler::get().enqueue(_pixelwise_mul_hidden); + _pixelwise_mul_hidden.run(); _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } - CLScheduler::get().enqueue(_accumulate_projection); + _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } } // Copy output_state_out to output - CLScheduler::get().enqueue(_copy_output); + _copy_output.run(); } void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. 
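
prepare() is the one-shot side of the function interface: work that depends only on the weights runs once, guarded by the _is_prepared flag, after which the original weight tensors can be released. A condensed sketch of the idiom (illustrative member names), which the transposition statements below apply to each weight matrix:

    void ExampleFunction::prepare()
    {
        if (!_is_prepared)
        {
            _weights_transposed.allocator()->allocate(); // backing memory for the transposed copy
            _transpose_weights.run();                    // one-off transpose into GEMM layout
            _weights->mark_as_unused();                  // original weights may now be freed
            _is_prepared = true;
        }
    }
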
_input_to_forget_weights_transposed.allocator()->allocate(); @@ -1060,18 +1355,25 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_input_reduction); - CLScheduler::get().enqueue(_recurrent_to_input_reduction); + + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); + + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1086,19 +1388,38 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_forget_reduction); - CLScheduler::get().enqueue(_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(_input_to_cell_reduction); - CLScheduler::get().enqueue(_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(_input_to_output_reduction); - CLScheduler::get().enqueue(_recurrent_to_output_reduction); - - if(_has_projection) + + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); + + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); + + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); + + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, &_recurrent_to_cell_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); + + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); + + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); + + if (_has_projection) { - if(_projection_bias != nullptr) + _projection_eff_bias.allocator()->allocate(); + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; + CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); + if (_projection_bias != nullptr) { - 
_projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_projection_reduction); + _projection_bias_add.run(); _projection_bias->mark_as_unused(); } @@ -1106,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index 6239f279ea..6edef29992 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClQuantize.h" namespace arm_compute { +struct CLQuantizationLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClQuantize> op{nullptr}; +}; + +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLQuantizationLayer::~CLQuantizationLayer() = default; + void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -35,13 +50,23 @@ void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output) void CLQuantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClQuantize>(); + _impl->op->configure(compile_context, input->info(), output->info()); } Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLQuantizationLayerKernel::validate(input, output); + return opencl::ClQuantize::validate(input, output); +} + +void CLQuantizationLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 57b8d70089..34b78eefa7 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
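
The rewritten CLQuantizationLayer just above is the template for this migration: the public function keeps only a pimpl (source, destination, backend operator) and re-binds the tensors through an ITensorPack on every run(), so the operator itself stays stateless. A hedged usage sketch (shapes and quantization parameters are illustrative):

    // Quantize an F32 tensor to QASYMM8 with scale 0.05 and offset 10.
    CLScheduler::get().default_init(); // create the CL context/queue once per process
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8,
                                     QuantizationInfo(0.05f, 10)));
    CLQuantizationLayer quantize;
    quantize.configure(&src, &dst); // builds the ClQuantize operator from the tensor infos
    src.allocator()->allocate();
    dst.allocator()->allocate();
    quantize.run(); // packs src/dst into an ITensorPack and runs the operator
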
* * SPDX-License-Identifier: MIT * @@ -29,23 +29,44 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +CLRNNLayer::~CLRNNLayer() = default; + +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, recurrent_weights, bias, hidden_state, output); + const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width)); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1)); @@ -55,28 +76,43 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); @@ -96,15 +132,15 @@ void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTen _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_kernel.configure(compile_context, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); - _activation_kernel.configure(compile_context, &_add_output, hidden_state, info); + _activation.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(compile_context, hidden_state, output); + _copy.configure(compile_context, hidden_state, output); } void CLRNNLayer::run() @@ -115,16 +151,16 @@ void CLRNNLayer::run() _fully_connected_kernel.run(); _gemm_state_f.run(); - CLScheduler::get().enqueue(_add_kernel); - CLScheduler::get().enqueue(_activation_kernel); + _add_kernel.run(); + _activation.run(); // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); + _copy.run(); } void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); @@ -132,3 +168,4 @@ void CLRNNLayer::prepare() _is_prepared = true; } } +} // namespace arm_compute diff --git 
a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 43b58ddb9b..1939d1d0ba 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,41 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>(); + auto k = std::make_unique<CLROIAlignLayerKernel>(); k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index bb54cfa2ca..0d2eab0c76 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
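
CLROIAlignLayer above, and CLROIPoolingLayer, CLRange, and CLReorgLayer below, all follow the simple-function pattern: configure() builds a single kernel, stores it in the _kernel member inherited from ICLSimpleFunction, and the base-class run() enqueues it. The only changes in this migration are the std::make_unique switch and the added parameter logging. Sketch of the pattern (the kernel type is a stand-in):

    // Inside SomeSimpleFunction::configure(...):
    auto k = std::make_unique<CLSomeKernel>();        // one concrete kernel per function
    k->configure(compile_context, input, output, params);
    _kernel = std::move(k);                           // ICLSimpleFunction::run() enqueues *_kernel
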
* * SPDX-License-Identifier: MIT * @@ -25,20 +25,37 @@ #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) +{ + return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); +} + +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>(); + auto k = std::make_unique<CLROIPoolingLayerKernel>(); k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index b29b03d5b5..5c3f7f9c8c 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -37,9 +38,11 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { - auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>(); + ARM_COMPUTE_LOG_PARAMS(output, start, end, step); + auto k = std::make_unique<CLRangeKernel>(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, output, start, end, step); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index ce447636ec..bef8d887fd 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,23 +23,29 @@ */ #include "arm_compute/runtime/CL/functions/CLReduceMean.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +53,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
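
As that comment says, reduction axes use the TensorFlow convention: a negative axis a refers to dimension rank + a, and convert_negative_axis() folds each entry back into [0, rank). For instance (illustrative values):

    // For a 4-D input, axis -1 addresses dimension 3.
    int       axis = -1;
    const int rank = 4;
    if (axis < 0)
    {
        axis += rank; // -1 -> 3, the same per-element mapping convert_negative_axis() applies
    }

The range checks that follow enforce exactly [-rank, rank) before that conversion happens.
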
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -78,86 +91,150 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); const unsigned int remove_index = axis_local[i] - i; ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); - out_shape.remove_dimension(remove_index); + out_shape.remove_dimension(remove_index, false); } } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) + { + TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); + CLDequantizationLayer::validate(input, &input_no_quant); + TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32)); + CLQuantizationLayer::validate(&output_no_quant, output); + } } return Status{}; } -} +} // namespace + CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), + _output_no_quant() { } + void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); + ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); + // Output auto 
inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); _keep_dims = keep_dims; + ICLTensor *tmp_input = input; + ICLTensor *tmp_output = output; + if (_do_requant) + { + _memory_group.manage(&_input_no_quant); + _memory_group.manage(&_output_no_quant); + TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape); + output_no_quant_info.set_data_type(DataType::F32); + auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info); + auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32)); + _dequant.configure(compile_context, input, &_input_no_quant); + tmp_input = &_input_no_quant; + tmp_output = &_output_no_quant; + } + Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = tmp_input->info()->num_dimensions(); convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!_keep_dims) { - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output); + auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); + } + if (_do_requant) + { + _requant.configure(compile_context, &_output_no_quant, output); + _input_no_quant.allocator()->allocate(); + _output_no_quant.allocator()->allocate(); } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -166,14 +243,21 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + if (_do_requant) + { + _dequant.run(); + } + for (auto &kernel : _reduction_kernels) { kernel.run(); } - - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } + if (_do_requant) + { + _requant.run(); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index b659ecfaf6..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
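
The requantization path added to CLReduceMean above is taken only when the input is quantized and its QuantizationInfo differs from the output's: a mean computed directly in the integer domain cannot absorb a scale/offset change, so the function dequantizes to F32, reduces in float, and quantizes back with the output's parameters. As a pipeline (stage types as used above):

    // Engaged when: is_data_type_quantized(in) && in_qinfo != out_qinfo
    //   q8(in_qinfo)  --CLDequantizationLayer-->        F32
    //                 --CLReductionOperation(MEAN_SUM,   per axis)--> F32
    //                 --CLQuantizationLayer-->           q8(out_qinfo)

Otherwise the reduction runs directly on the input type, exactly as before.
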
* * SPDX-License-Identifier: MIT * @@ -24,39 +24,46 @@ #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/Utils.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(), + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), _is_reshape_required(false) { } -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +CLReductionOperation::~CLReductionOperation() = default; + +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis); - const bool is_serial = needs_serialized_reduction(op, input->data_type(), axis); - const bool is_reshape_required = !keep_dims; + const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -64,95 +71,29 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo output_before_reshape; const auto input_shape = input->tensor_shape(); - const auto input_data_type = input->data_type(); const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int 
num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } - if(is_serial) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector<TensorInfo> sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input_shape }; - - shape.set(0, ceil(shape.x() / 128.f)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - for(unsigned int i = 0; i < num_of_stages - 1; i++) - { - initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channles, input_qinfo); - } - - ReductionOperation first_kernel_op; - ReductionOperation intermediate_kernel_op; - ReductionOperation last_kernel_op; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = op; - break; - case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = ReductionOperation::SUM; - break; - case ReductionOperation::PROD: - first_kernel_op = ReductionOperation::PROD; - intermediate_kernel_op = ReductionOperation::PROD; - last_kernel_op = ReductionOperation::PROD; - break; - case ReductionOperation::MIN: - first_kernel_op = ReductionOperation::MIN; - intermediate_kernel_op = ReductionOperation::MIN; - last_kernel_op = ReductionOperation::MIN; - break; - case ReductionOperation::MAX: - first_kernel_op = ReductionOperation::MAX; - intermediate_kernel_op = ReductionOperation::MAX; - last_kernel_op = ReductionOperation::MAX; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0))); - } - - if(is_reshape_required) + if (is_reshape_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } return Status{}; @@ -160,199 +101,59 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required && _is_serial) + if (!_is_reshape_required) { 
return output; } - auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages; - - if(!_is_reshape_required) - { - --intermediate_result_vector_size; - } - - _results_vector.resize(intermediate_result_vector_size); auto shape = input->info()->tensor_shape(); - - shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f)); - - for(auto &v : _results_vector) - { - if(&v == &_results_vector.back() && _is_reshape_required) - { - shape.set(_reduction_axis, 1); - } - v.allocator()->init(input->info()->clone()->set_tensor_shape(shape)); - } - - return _is_reshape_required ? &_results_vector.back() : output; + shape.set(_reduction_axis, 1); + _unreshaped_output.allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _op = op; - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _reduction_axis = axis; - _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis); _is_reshape_required = !keep_dims; auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - } - - // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); - // Create temporary tensors - if(_is_serial) - { - if(_is_reshape_required) - { - _memory_group.manage(&_results_vector.back()); - } - - _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); + _memory_group.manage(&_unreshaped_output); } - else - { - _border_handlers_vector.resize(_num_of_stages); - _memory_group.manage(&_results_vector[0]); - ReductionOperation first_kernel_op; - ReductionOperation intermediate_kernel_op; - ReductionOperation last_kernel_op; - PixelValue pixelValue; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - intermediate_kernel_op = 
ReductionOperation::SUM; - last_kernel_op = op; - pixelValue = PixelValue(); - break; - case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = ReductionOperation::SUM; - pixelValue = PixelValue(); - break; - case ReductionOperation::PROD: - first_kernel_op = ReductionOperation::PROD; - intermediate_kernel_op = ReductionOperation::PROD; - last_kernel_op = ReductionOperation::PROD; - pixelValue = PixelValue(1, input->info()->data_type()); - break; - case ReductionOperation::MIN: - first_kernel_op = ReductionOperation::MIN; - intermediate_kernel_op = ReductionOperation::MIN; - last_kernel_op = ReductionOperation::MIN; - switch(input->info()->data_type()) - { - case DataType::F32: - { - pixelValue = PixelValue(std::numeric_limits<float>::max()); - break; - } - case DataType::F16: - { - pixelValue = PixelValue(static_cast<half>(65504.0f)); - break; - } - case DataType::QASYMM8: - { - pixelValue = std::get<1>(get_min_max(input->info()->data_type())); - break; - } - case DataType::QASYMM8_SIGNED: - { - pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info()); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported DataType"); - } - } - break; - case ReductionOperation::MAX: - first_kernel_op = ReductionOperation::MAX; - intermediate_kernel_op = ReductionOperation::MAX; - last_kernel_op = ReductionOperation::MAX; - switch(input->info()->data_type()) - { - case DataType::F32: - { - pixelValue = PixelValue(-std::numeric_limits<float>::max()); - break; - } - case DataType::F16: - { - pixelValue = PixelValue(static_cast<half>(-65504.0f)); - break; - } - case DataType::QASYMM8: - { - pixelValue = std::get<0>(get_min_max(input->info()->data_type())); - break; - } - case DataType::QASYMM8_SIGNED: - { - pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info()); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported DataType"); - } - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } + _reduction_kernel = std::make_unique<CLReductionOperationKernel>(); + _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - const unsigned int input_width = input->info()->dimension(0); - - if(_is_reshape_required) - { - _memory_group.manage(&_results_vector.back()); - } - - _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], 
_reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); - _results_vector[last_stage - 1].allocator()->allocate(); - } - - if(_is_reshape_required) + if (_is_reshape_required) { - _reshape_kernel.configure(compile_context, &_results_vector.back(), output); - _results_vector.back().allocator()->allocate(); + _reshape.configure(compile_context, &_unreshaped_output, output); + _unreshaped_output.allocator()->allocate(); } } @@ -360,22 +161,11 @@ void CLReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_serial) - { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); - } - else - { - for(unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); - } - } + CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { - CLScheduler::get().enqueue(_reshape_kernel, false); + _reshape.run(); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp deleted file mode 100644 index af241ec299..0000000000 --- a/src/runtime/CL/functions/CLRemap.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
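The CLReductionOperation rewrite that completes above replaces the old multi-stage x-axis pipeline (one reduction kernel plus one border handler per stage, chained through _results_vector) with a single CLReductionOperationKernel plus, only when keep_dims is false, one CLReshape step fed from a single _unreshaped_output temporary. A minimal usage sketch of the refactored function follows; the scheduler boilerplate, F32 data type and tensor shapes are illustrative assumptions, not taken from this commit:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create the CL context/queue once per process

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(256U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::F32));

    // One kernel now covers the whole x-axis reduction; keep_dims=true skips the reshape.
    CLReductionOperation reduce;
    reduce.configure(&src, &dst, 0U, ReductionOperation::SUM, true);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map src, fill it with data, unmap ...

    reduce.run();
    CLScheduler::get().sync();
    return 0;
}

The later sketches in this section assume the same headers, namespace and scheduler setup (plus each function's own public header) and show only the function-specific calls.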
- */ -#include "arm_compute/runtime/CL/functions/CLRemap.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value); -} - -void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, - BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); - - auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>(); - k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index ea9331414c..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include <utility> @@ -39,9 +40,13 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { - auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, stride); + auto k = std::make_unique<CLReorgLayerKernel>(); k->configure(compile_context, input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 13baedb3f9..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. 
+ * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,31 @@ */ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClReshape.h" /** [CLReshapeLayer snippet] **/ -using namespace arm_compute; +namespace arm_compute +{ +struct CLReshapeLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClReshape> op{nullptr}; +}; + +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>()) +{ +} + +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; +CLReshapeLayer::~CLReshapeLayer() = default; void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output) { @@ -37,13 +56,26 @@ void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output) void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClReshape>(); + _impl->op->configure(compile_context, input->info(), output->info()); } Status CLReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLReshapeLayerKernel::validate(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClReshape::validate(input, output)); + + return Status{}; } -/** [CLReshapeLayer snippet] **/ + +void CLReshapeLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute + /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 3c8bc15a54..a20be2335a 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
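The CLReshapeLayer change above is the template for most migrations in this commit: the public function keeps a pimpl (Impl) holding the source/destination ICLTensors together with an opencl::ClReshape operator that is configured purely on ITensorInfo metadata, and run() binds the real tensors late through an ITensorPack. Nothing changes on the caller's side; a short sketch (shapes illustrative, boilerplate as in the reduction sketch above):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // same total element count

CLReshapeLayer reshape;
reshape.configure(&src, &dst); // configures ClReshape on the tensors' metadata only

src.allocator()->allocate();
dst.allocator()->allocate();
reshape.run(); // packs src/dst as ACL_SRC/ACL_DST and runs the operator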
* * SPDX-License-Identifier: MIT * @@ -23,26 +23,35 @@ */ #include "arm_compute/runtime/CL/functions/CLReverse.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { -void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { - auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>(); - k->configure(compile_context, input, output, axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis); + auto k = std::make_unique<CLReverseKernel>(); + k->configure(compile_context, input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status CLReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return CLReverseKernel::validate(input, output, axis); + return CLReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index a9395bdc3d..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,45 +23,54 @@ */ #include "arm_compute/runtime/CL/functions/CLScale.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/KernelDescriptors.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClScale.h" -void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, - bool align_corners) +namespace arm_compute { - configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners); +struct CLScale::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClScale> op{nullptr}; +}; + +CLScale::CLScale() : _impl(std::make_unique<Impl>()) +{ +} +CLScale::~CLScale() = default; + +void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, - SamplingPolicy sampling_policy, bool use_padding, bool align_corners) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { - ARM_COMPUTE_UNUSED(use_padding); - auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; - // Tune kernels - CLScheduler::get().tune_kernel_static(*_kernel); + _impl->op = std::make_unique<opencl::ClScale>(); + _impl->op->configure(compile_context, input->info(), output->info(), info); +} - // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension, - // so we treat it like border constant. 
- if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC) - { - border_mode = BorderMode::CONSTANT; - } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value); +Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) +{ + return opencl::ClScale::validate(input, output, info); } -Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, - bool use_padding, bool align_corners) +void CLScale::run() { - ARM_COMPUTE_UNUSED(constant_border_value, use_padding); - return CLScaleKernel::validate(input, output, policy, border_mode, sampling_policy, align_corners); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScatter.cpp b/src/runtime/CL/functions/CLScatter.cpp new file mode 100644 index 0000000000..e16fcc4ccc --- /dev/null +++ b/src/runtime/CL/functions/CLScatter.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLScatter.h" + +#include "arm_compute/function_info/ScatterInfo.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/gpu/cl/operators/ClScatter.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClScatter; + +struct CLScatter::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; + +CLScatter::CLScatter() : _impl(std::make_unique<Impl>()) +{ +} + +CLScatter::~CLScatter() = default; + +void CLScatter::configure(const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + configure(CLKernelLibrary::get().get_compile_context(), src, updates, indices, output, info); +} + +void CLScatter::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, output); + + _impl->op = std::make_unique<OperatorType>(); + if (src) + { // Src not nullptr. 
+ _impl->op->configure(compile_context, src->info(), updates->info(), indices->info(), output->info(), info); + } + else + { + _impl->op->configure(compile_context, nullptr, updates->info(), indices->info(), output->info(), info); + } + _impl->run_pack = {{ACL_SRC_0, src}, {ACL_SRC_1, updates}, {ACL_SRC_2, indices}, {ACL_DST, output}}; +} + +Status CLScatter::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *output, + const ScatterInfo &info) +{ + return OperatorType::validate(src, updates, indices, output, info); +} + +void CLScatter::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp deleted file mode 100644 index faad5424a2..0000000000 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" - -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index 7187010448..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
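Two changes finish just above. CLScale drops the border-mode and use_padding plumbing; interpolation, border and sampling options now travel in one ScaleKernelInfo descriptor (from arm_compute/core/KernelDescriptors.h) handed to opencl::ClScale. And the newly added CLScatter accepts a null src, configuring the operator without source metadata; whether the output is zero-initialised instead is governed by ScatterInfo. Hedged sketches: the BILINEAR/REPLICATE and ScatterFunction::Update choices and all shapes are illustrative assumptions:

// Upscale 64x64 -> 128x128, options carried by the ScaleKernelInfo descriptor.
CLTensor in, out;
in.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
out.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::F32));

CLScale scale;
scale.configure(&in, &out, ScaleKernelInfo(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE));

// Scatter two scalar updates into a 6-element tensor (shapes are an assumption here).
CLTensor src, updates, indices, dst;
src.allocator()->init(TensorInfo(TensorShape(6U), 1, DataType::F32));
updates.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));
indices.allocator()->init(TensorInfo(TensorShape(1U, 2U), 1, DataType::S32));
dst.allocator()->init(TensorInfo(TensorShape(6U), 1, DataType::F32));

CLScatter scatter;
scatter.configure(&src, &updates, &indices, &dst, ScatterInfo(ScatterFunction::Update, false));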
* * SPDX-License-Identifier: MIT * @@ -23,10 +23,12 @@ */ #include "arm_compute/runtime/CL/functions/CLSelect.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" + using namespace arm_compute; namespace arm_compute @@ -36,9 +38,14 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>(); + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); + auto k = std::make_unique<CLSelectKernel>(); k->configure(compile_context, c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index e8cc0f5499..f79c6a1235 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,43 +24,95 @@ #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { -void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +namespace experimental { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); -} - -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); + auto k = std::make_unique<CLStridedSliceKernel>(); k->configure(compile_context, input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { 
return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); } +} // namespace experimental + +struct CLSlice::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLSlice> op{nullptr}; +}; + +CLSlice::CLSlice() : _impl(std::make_unique<Impl>()) +{ +} +CLSlice::CLSlice(CLSlice &&) = default; +CLSlice &CLSlice::operator=(CLSlice &&) = default; +CLSlice::~CLSlice() = default; + +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) +{ + return experimental::CLSlice::validate(input, output, starts, ends); +} + +void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); +} + +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLSlice>(); + _impl->op->configure(compile_context, input->info(), output->info(), starts, ends); +} + +void CLSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp deleted file mode 100644 index c3604f970f..0000000000 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
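CLSlice above now exists in two layers: a stateless experimental::CLSlice that validates and configures on ITensorInfo, expressing the slice as a strided slice with default (unit) strides plus a computed end mask, and the public CLSlice that stores the tensors in an Impl and feeds them to the operator in run(). The public call is unchanged; sketch (coordinates illustrative, end coordinates treated as exclusive):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

CLSlice slice;
slice.configure(&src, &dst, Coordinates(2, 2), Coordinates(6, 6)); // rows/cols [2, 6)

src.allocator()->allocate();
dst.allocator()->allocate();
slice.run();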
- */ -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" - -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp deleted file mode 100644 index f8a33f3fb6..0000000000 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -using namespace arm_compute; - -CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() -{ -} - -void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel5x5::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp deleted file mode 100644 index 6d3c7f0d08..0000000000 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -using namespace arm_compute; - -CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() -{ -} - -void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, 
&_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel7x7::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index b0b2117cd9..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,213 +24,78 @@ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" +#include "src/gpu/cl/operators/ClSoftmax.h" namespace arm_compute { +using OperatorType = opencl::ClSoftmax; + template <bool IS_LOG> -CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(), - _needs_flattening(false) +struct CLSoftmaxLayerGeneric<IS_LOG>::Impl { -} + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<CLTensor> workspace_tensors{}; +}; template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis) +CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) + : _impl(std::make_unique<Impl>()) { - configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis) -{ - // Flatten the input - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); - - // Initialize the flat input - _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // If we need to flatten the input, we can use CLFlattenKernel or CLReshapeKernel - // If flattening on the third axes, we use CLFlattenKernel. 
- // In all other cases we have to use CLReshapeKernel - if(axis != 3) - { - auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>(); - reshape_kernel_ptr->configure(compile_context, input, &_input_flattened); - _flatten_kernel_ptr = std::move(reshape_kernel_ptr); - } - else - { - auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>(); - flatten_kernel_ptr->configure(compile_context, input, &_input_flattened); - _flatten_kernel_ptr = std::move(flatten_kernel_ptr); - } - - // We need to init the output tensor here. Indeed, the reshape kernel expects - // both tensors to be already initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); -} +CLSoftmaxLayerGeneric<IS_LOG>::~CLSoftmaxLayerGeneric() = default; template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis)); - - // We don't need flattening only in the case the input is 2D and axis is 1 - _needs_flattening = axis != 1; - - // If we are dealing with a 4D tensor, we will: - // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor - // - Execute all the pipeline (reduction + normalization) on the flattened tensor - // - Reshape the flattened output into the real output - if(_needs_flattening) - { - // Add to the memory manager _input_flattened - _memory_group.manage(&_input_flattened); - - // Cofigure _flatten_kernel and _input_flattened - configure_reshape_input_kernel(input, output, axis); - } - - // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) - // or it is the original input case (2D case) - const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input); - - // Create intermediate tensors shapes - TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? 
DataType::S32 : input_2D->info()->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - _tmp.allocator()->init(tensor_info_tmp); - - TensorShape max_sum_shape = input_2D->info()->tensor_shape(); - max_sum_shape.set(0, 1); - _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); - _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); - - // Set GPU target to kernels - _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - _memory_group.manage(&_max); - _memory_group.manage(&_sum); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); - SoftmaxKernelInfo softmax_info; - softmax_info.beta = beta; - softmax_info.is_log = IS_LOG; - softmax_info.input_data_type = input_2D->info()->data_type(); + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; + _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - // Configure kernels - _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info); - - if(_needs_flattening) - { - // Add to the memory manager _output_flattened - _memory_group.manage(&_output_flattened); - - // The normalization kernel stores the result in a flat output tensor - _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info); - - // Reshape the flat output into a the requested (4D) output - _reshape_kernel.configure(compile_context, &_output_flattened, output); - - // Allocate the intermediate flat tensors - _input_flattened.allocator()->allocate(); - _output_flattened.allocator()->allocate(); - } - else - { - // Softmax 2D case - _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info); - } - - // Allocate intermediate buffers - _tmp.allocator()->allocate(); - _max.allocator()->allocate(); - _sum.allocator()->allocate(); + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) +Status +CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - - // Create intermediate tensor info - DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? 
DataType::S32 : input->data_type(); - TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = input->tensor_shape(); - max_sum_shape.set(0, 1); - TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - const bool needs_flattening = (axis != 1); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - - if(axis != 3) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat)); - } - } - - SoftmaxKernelInfo softmax_info; - softmax_info.beta = beta; - softmax_info.is_log = IS_LOG; - softmax_info.input_data_type = input->data_type(); - - ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output, softmax_info)); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - } - - return Status{}; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; + return OperatorType::validate(*input, *output, softmax_info); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::run() +void CLSoftmaxLayerGeneric<IS_LOG>::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_needs_flattening) - { - CLScheduler::get().enqueue(*_flatten_kernel_ptr, false); - } - - CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); - CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening); - - if(_needs_flattening) - { - CLScheduler::get().enqueue(_reshape_kernel, true); - } + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } template class CLSoftmaxLayerGeneric<false>; diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 021d31649d..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
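The softmax migration ending above pushes all staging (the optional flatten/permute, max-shift-exp-sum and normalisation passes) into opencl::ClSoftmax; the function merely keeps an ITensorPack plus a WorkspaceData that parks the operator's requested scratch tensors in its MemoryGroup. Note the axis parameter also changes type from size_t to int32_t, so a signed (wrappable) axis can be expressed. Minimal sketch (beta and axis values illustrative):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

CLSoftmaxLayer softmax; // CLSoftmaxLayerGeneric<false>; the <true> instantiation is log-softmax
softmax.configure(&src, &dst, /* beta */ 1.0f, /* axis */ 0);

src.allocator()->allocate();
dst.allocator()->allocate();
softmax.run(); // workspace tensors are acquired only for the duration of run()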
* * SPDX-License-Identifier: MIT * @@ -30,61 +30,99 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" + namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) + : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false) { } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; + +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { 
_has_padding = true;
-        _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+                                      output);
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const ITensorInfo *block_shape,
+                                     const ITensorInfo *paddings,
+                                     const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const int block_shape_x,
+                                     const int block_shape_y,
+                                     const Size2D &padding_left,
+                                     const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
 }
@@ -92,10 +130,10 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    if(_has_padding)
+    if (_has_padding)
     {
-        CLScheduler::get().enqueue(_memset_kernel, true);
+        _fill.run();
     }
-    CLScheduler::get().enqueue(_space_to_batch_kernel, true);
+    CLScheduler::get().enqueue(*_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index a4ffefc189..22695c9ef3 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
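In the SpaceToBatch changes above, CLMemsetKernel gives way to the CLFill function, which run() invokes only when implicit padding leaves output elements the kernel never writes. CLFill also works standalone; a small sketch (the zero value and quantized type are illustrative):

CLTensor out;
out.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::QASYMM8));
out.allocator()->allocate();

CLFill fill;
fill.configure(&out, PixelValue(0, out.info()->data_type(), out.info()->quantization_info()));
fill.run(); // writes the constant into every element of out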
* * SPDX-License-Identifier: MIT * @@ -30,21 +30,29 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" + namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel() +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) { } +CLSpaceToDepthLayer::~CLSpaceToDepthLayer() = default; + void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { - _space_to_depth_kernel.configure(compile_context, input, output, block_shape); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -54,6 +62,6 @@ Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo void CLSpaceToDepthLayer::run() { - CLScheduler::get().enqueue(_space_to_depth_kernel, true); + CLScheduler::get().enqueue(*_space_to_depth_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index cdc44d8373..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,13 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 79c3fe5371..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
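CLSpaceToDepthLayer receives the same mechanical update just above: the kernel moves behind a std::unique_ptr (it is now an incomplete type from the public header's point of view), the class gains an explicit defaulted destructor, and configure() logs its parameters. Sketch of a block-2 rearrangement, where width and height shrink by the block size and channels grow by its square (shapes illustrative):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 16U), 1, DataType::F32));

CLSpaceToDepthLayer s2d;
s2d.configure(&src, &dst, /* block_shape */ 2);

src.allocator()->allocate();
dst.allocator()->allocate();
s2d.run();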
*/ -#include <complex> - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -33,31 +31,41 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include <complex> + namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } +CLStackLayer::~CLStackLayer() = default; + void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector<ICLTensor *> &input, + int axis, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); + _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>()); + _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } @@ -72,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -85,9 +93,9 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { - CLScheduler::get().enqueue(_stack_kernels[i], false); + CLScheduler::get().enqueue(*_stack_kernels[i], false); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 454759664c..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,113 @@ */ #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +namespace experimental { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); -} - -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto k = std::make_unique<CLStridedSliceKernel>(); k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } +} // namespace experimental + +struct CLStridedSlice::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<experimental::CLStridedSlice> op{nullptr}; +}; + +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) +{ + _impl->ctx = ctx; +} + +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; +CLStridedSlice::~CLStridedSlice() = default; + +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} + +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + 
_impl->dst = output; + + _impl->op = std::make_unique<experimental::CLStridedSlice>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); +} + +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} + +void CLStridedSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp deleted file mode 100644 index 47e15d3c12..0000000000 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLTableLookup.h" - -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); -} - -void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>(); - k->configure(compile_context, input, lut, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp deleted file mode 100644 index 57c92724fa..0000000000 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
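After the CLStridedSlice rework above, the function keeps only tensor pointers in its Impl struct and defers all work to an experimental operator that receives the tensors through an ITensorPack at run time. Below is a minimal caller-side sketch; the tensor shape and slice coordinates are illustrative assumptions, not values taken from this patch.

    // Hedged sketch: slice every second column of an invented 8x8 F32 tensor.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

    using namespace arm_compute;

    void strided_slice_example()
    {
        CLScheduler::get().default_init(); // create context/queue

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        // starts=(0,0), ends=(8,8), strides=(2,1); all masks zero.
        CLStridedSlice slice;
        slice.configure(&src, &dst, Coordinates(0, 0), Coordinates(8, 8), BiStrides(2, 1), 0, 0, 0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run(); // packs ACL_SRC/ACL_DST and runs the underlying operator
    }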
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLThreshold.h" - -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper); -} - -void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, - uint8_t upper) -{ - auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>(); - k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 178d7af95e..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -33,9 +33,13 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { - auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); + auto k = std::make_unique<CLTileKernel>(); k->configure(compile_context, input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index f5121d06a5..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLTranspose.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClTranspose.h" -using namespace arm_compute; +namespace arm_compute +{ +struct CLTranspose::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClTranspose> op{nullptr}; +}; +CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>()) +{ +} +CLTranspose::~CLTranspose() = default; void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) { @@ -37,12 +51,23 @@ void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) void CLTranspose::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClTranspose>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLTranspose::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLTransposeKernel::validate(input, output); + return opencl::ClTranspose::validate(input, output); +} + +void CLTranspose::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 032fb993d0..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -64,14 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector<ICLTensor *> &output_vector, + int axis) { + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -84,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -103,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { 
slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp deleted file mode 100644 index dd04686d60..0000000000 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -namespace arm_compute -{ -CLUpsampleLayer::CLUpsampleLayer() // NOLINT - : _upsample(), - _output(nullptr) -{ -} - -Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - return CLUpsampleLayerKernel::validate(input, output, info, upsampling_policy); -} - -void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy); -} - -void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _output = output; - _upsample.configure(compile_context, input, _output, info, upsampling_policy); -} - -void CLUpsampleLayer::run() -{ - CLScheduler::get().enqueue(_upsample, false); -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp deleted file mode 100644 index ce2171b3d4..0000000000 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLWarpAffine.h" - -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp deleted file mode 100644 index 06c06616d0..0000000000 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" - -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 132c3ee926..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,221 +23,105 @@ */ #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/KernelDescriptors.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClWinogradConv2d.h" +#include "support/Cast.h" -namespace +namespace arm_compute { -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout) +struct CLWinogradConvolutionLayer::Impl { - Size2D output_tile = Size2D{}; - - const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); - - // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - - if(kernel_max_dim == 3U) - { - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); - } - else - { - output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); - } - } - else if(kernel_max_dim == 5U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); - } - else if(kernel_max_dim == 7U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 
1U : 2U); - } - - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - std::vector<WinogradConfiguration> fast_math_winograd = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); -} -} // namespace + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), - _is_prepared(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) +CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; + +void CLWinogradConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, 
input->info()->data_layout()); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->info()->data_layout()); - - _is_prepared = false; - _original_weights = weights; - - // Manage intermediate tensors - _memory_group.manage(&_input0); - _memory_group.manage(&_batched_mm_output); - - // Do not manage _input1 as it contains the weights - - // Configure input transform - _input_transform.configure(compile_context, input, &_input0, winograd_info); - - // Configure filter transform - _filter_transform.configure(compile_context, weights, &_input1, winograd_info); - - // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), - (input->info()->data_type() == DataType::F16))); - - // Configure output transform - _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); - - // Allocate temporary tensors - _input0.allocator()->allocate(); - _batched_mm_output.allocator()->allocate(); + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClWinogradConv2d>(); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); + + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - // Get indeces for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->data_layout()); - - // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info)); - - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); - - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16)))); - - // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info, act_info)); - - return Status{}; + return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void CLWinogradConvolutionLayer::run() { + MemoryGroupResourceScope scope_mg(_impl->memory_group); prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input transform - _input_transform.run(); - - // Run batched matrix multiplication - _batched_mm.run(); - - // Run output transform - CLScheduler::get().enqueue(_output_transform); + _impl->op->run(_impl->run_pack); } void CLWinogradConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - // Run filter transform and mark original weights as unused - _input1.allocator()->allocate(); - CLScheduler::get().enqueue(_filter_transform, false); - _original_weights->mark_as_unused(); - - // Prepare GEMM and release reshaped weights if marked unused by CLGEMM - _batched_mm.prepare(); - if(!_input1.is_used()) - { - _input1.allocator()->free(); - } + _impl->op->prepare(_impl->run_pack); - CLScheduler::get().queue().finish(); - _is_prepared = true; + // Release Preparation tensors + release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack); + _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1); + _impl->is_prepared = true; } } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp deleted file mode 100644 index ae400768fe..0000000000 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. 
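The CLWinogradConvolutionLayer rework above collapses the separate input/filter/output transform kernels and the batched GEMM into a single opencl::ClWinogradConv2d operator fed through a run_pack, with workspace tensors managed by the function. prepare() performs the one-off weight transform and then releases the preparation-only tensors; run() (which also calls prepare() on first use) reuses the transformed weights. A rough caller-side sketch, with the convolution geometry invented for illustration:

    // Hedged sketch of the function lifecycle (tensor allocation omitted;
    // a 3x3 / stride 1 / pad 1 convolution is assumed, fast math off).
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    using namespace arm_compute;

    void winograd_example(ICLTensor *src, ICLTensor *weights, ICLTensor *biases, ICLTensor *dst)
    {
        CLWinogradConvolutionLayer conv;
        conv.configure(src, weights, biases, dst, PadStrideInfo(1, 1, 1, 1), ActivationLayerInfo(),
                       /*enable_fast_math=*/false);

        conv.prepare(); // one-off weight transform; prep-only workspace released here
        for (int i = 0; i < 4; ++i)
        {
            conv.run(); // input transform -> batched GEMM -> output transform
        }
    }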
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" -#include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info); -} - -void CLWinogradInputTransform::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>(); - k->configure(compile_context, input, output, winograd_info); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); -} - -Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info)); - return Status{}; -} diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp deleted file mode 100644 index 0c0c1065bc..0000000000 --- a/src/runtime/CL/functions/CLYOLOLayer.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h" - -#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes); -} - -void CLYOLOLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>(); - k->configure(compile_context, input, output, act_info, num_classes); - _kernel = std::move(k); -} - -Status CLYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - return CLYOLOLayerKernel::validate(input, output, act_info, num_classes); -} diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp new file mode 100644 index 0000000000..4270165ab4 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
+{
+}
+
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelectionParams &params)
+{
+    // _target could be used in the future to have a dedicated heuristic for each GPU IP
+    ARM_COMPUTE_UNUSED(_target);
+
+    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+
+    // Default configurations for Bifrost architectures
+    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G71 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G52 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G76 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    const DataType data_type = params.data_type;
+
+    switch (_target)
+    {
+        case GPUTarget::G71:
+            if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+            {
+                return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            ARM_COMPUTE_ERROR("Not supported data type");
+        case GPUTarget::G76:
+            if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+            {
+                return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            ARM_COMPUTE_ERROR("Not supported data type");
+        case GPUTarget::G52:
+            if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+            {
+                return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            
ARM_COMPUTE_ERROR("Not supported data type"); + default: + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) + { + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + } +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; + + if (is_rhs_constant) + { + if ((m > 1) && (n < 16)) + { + gemm_type = CLGEMMKernelType::RESHAPED; + } + else if (m == 1) + { + gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if ((k > 256) && (m > 4)) + { + constexpr float alpha = 3.2f; + constexpr float fact0 = 1.51f; + constexpr float fact1 = 1.66f; + constexpr float ops = 12.0f; + const float scale = k > 1024 ? 1.07f : 1.0f; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + + const auto workload = static_cast<float>((m * n) / 20.0f); + + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; + } + + return gemm_type; +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(n, k, b); + + if (is_rhs_constant) + { + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (is_rhs_constant) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + if (k <= 496) + { + if (n <= 544) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (k <= 588) + { + if (k <= 552) + { + if (m <= 148) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 278) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k)); + + if (r_mn <= 
1.5469f) + { + if (r_mk <= 0.8766f) + { + if (r_mk <= 0.0211f) + { + if (r_mnk <= 77.5833f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (r_nk <= 0.0832f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_mnk <= 193.0000f) + { + if (r_mn <= 0.9948f) + { + if (r_mk <= 2.5453f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_mn <= 17.7370f) + { + if (r_mnk <= 1391.2875f) + { + if (r_mk <= 2.9724f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_mnk <= 470.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_nk <= 0.1381f) + { + if (r_mnk <= 9040.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (r_mn <= 5.6790f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + if (k <= 212) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_nk <= 0.4990234375f) + { + if (k <= 1392) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 325) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (k <= 471) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_mn <= 0.04475911520421505f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + if (n <= 127.0000f) + { + if (n <= 63.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 3616.0000f) + { + if (b <= 18.5000f) + { + if (m <= 2970.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 104.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (m <= 12.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 104.0000f) + { + if (b <= 18.5000f) + { + if (m <= 490.0000f) + { + if (n <= 272.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + 
return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (m <= 226.0000f) + { + if (n <= 140.0000f) + { + if (m <= 179.5000f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); + + if (is_rhs_constant) + { + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::NATIVE; + } +} +} // namespace cl_gemm +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h new file mode 100644 index 0000000000..0cbab35c2e --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
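A minimal worked trace of the g76_f16() decision tree above, using invented shape values; this is an illustration only, not part of the patch:

    // Illustrative trace of g76_f16() for an invented shape
    // (m = 320, n = 1024, k = 4096, constant RHS). Not ACL code.
    #include <iostream>

    int main()
    {
        const unsigned int m = 320, n = 1024, k = 4096;
        const float r_nk = static_cast<float>(n) / static_cast<float>(k); // = 0.25f
        // Branches taken in g76_f16 above: m != 1, k > 212, r_nk <= 0.4990234375f,
        // k > 1392, m <= 325  =>  CLGEMMKernelType::RESHAPED_ONLY_RHS.
        std::cout << "r_nk = " << r_nk << " -> RESHAPED_ONLY_RHS\n";
        return 0;
    }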
+ */ +#ifndef SRC_CLGEMMDEFAULTTYPEBIFROST_H +#define SRC_CLGEMMDEFAULTTYPEBIFROST_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** Bifrost based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeBifrost final : public ICLGEMMKernelSelection +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeBifrost(GPUTarget gpu); + + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif /* SRC_CLGEMMDEFAULTTYPEBIFROST_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index a94a392553..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include <map> #include <utility> @@ -35,58 +36,59 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMKernelSelectionMidgard::CLGEMMKernelSelectionMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::select_kernel(const CLGEMMKernelSelectionParams &params) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelectionParams &params) { // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionMidgard::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = - { - { DataType::F32, &CLGEMMKernelSelectionMidgard::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionMidgard::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_configs = { + {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? 
 CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k, is_rhs_constant); + ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); return CLGEMMKernelType::NATIVE; } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h index 62714fed5c..241072fd58 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,33 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h" +#ifndef SRC_CLGEMMDefaultTypeMidgard_H +#define SRC_CLGEMMDefaultTypeMidgard_H -#include "support/MemorySupport.h" +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" namespace arm_compute { -void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +namespace cl_gemm { - configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); -} - -void CLComputeAllAnchors::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +/** Midgard based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeMidgard final : public ICLGEMMKernelSelection { - // Configure ComputeAllAnchors kernel - auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>(); - k->configure(compile_context, anchors, all_anchors, info); - _kernel = std::move(k); -} +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeMidgard(GPUTarget gpu); -Status CLComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) -{ - return CLComputeAllAnchorsKernel::validate(anchors, all_anchors, info); -} + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm } // namespace arm_compute +#endif /* SRC_CLGEMMDefaultTypeMidgard_H */ diff --git 
 a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp new file mode 100644 index 0000000000..851e23bc84 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <map> +#include <utility> + +namespace arm_compute +{ +namespace cl_gemm +{ +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) +{ +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelectionParams &params) +{ + // _target could be used in the future to have a dedicated heuristic for each GPU IP + ARM_COMPUTE_UNUSED(_target); + + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + + // Default configurations for Valhall architectures + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G77 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G78 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + 
{DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G710 and Mali-G610 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G715 and Mali-G615 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + const DataType data_type = params.data_type; + + switch (_target) + { + case GPUTarget::G710: + case GPUTarget::G610: + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + { + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G715: + case GPUTarget::G615: + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + { + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G78: + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + { + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G77: + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + { + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + default: + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) + { + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + } +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (is_rhs_constant) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + if (n <= 272.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 471.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 72.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 90.5000f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + if (k <= 2448.0000f) + { + if (n <= 756.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + return CLGEMMKernelType::RESHAPED_ONLY_RHS; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return default_f32(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return default_f32(m, n, k, b, is_rhs_constant); + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return g78_f16(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return g78_f16(m, n, k, b, is_rhs_constant); + } +} + +} // namespace cl_gemm +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h new file mode 100644 index 0000000000..e190295ee4 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
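The g715_f32()/g715_f16() paths above first probe the MMUL-specialised kernel and only then fall back to the older heuristic. A hedged sketch of that pattern, with a hypothetical predicate standing in for opencl::kernels::gemm::is_mmul_kernel_preferred(), whose implementation is outside this hunk:

    #include <functional>

    enum class KernelType
    {
        RESHAPED_ONLY_RHS,
        RESHAPED_ONLY_RHS_MMUL
    };

    // Hypothetical stand-in; the real predicate also reports the preferred
    // block sizes through best_m0/best_n0.
    static bool mmul_preferred(unsigned int m, unsigned int n, unsigned int k, unsigned int b,
                               unsigned int &best_m0, unsigned int &best_n0)
    {
        best_m0 = 4;
        best_n0 = 4;
        return (m % 4 == 0) && (n % 4 == 0) && (k >= 64) && (b >= 1); // invented condition
    }

    static KernelType select_g715_like(unsigned int m, unsigned int n, unsigned int k, unsigned int b,
                                       const std::function<KernelType()> &fallback)
    {
        unsigned int best_m0 = 0, best_n0 = 0;
        // Try the MMUL variant first; otherwise reuse the pre-existing heuristic unchanged.
        return mmul_preferred(m, n, k, b, best_m0, best_n0) ? KernelType::RESHAPED_ONLY_RHS_MMUL : fallback();
    }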
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CLGEMMDEFAULTTYPEVALHALL_H +#define SRC_CLGEMMDEFAULTTYPEVALHALL_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** Valhall based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeValhall final : public ICLGEMMKernelSelection +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeValhall(GPUTarget gpu); + + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif /* SRC_CLGEMMDEFAULTTYPEVALHALL_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h new file mode 100644 index 0000000000..98dd44b1bf --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H +#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** CLGEMMKernelSelection factory class */ +class CLGEMMKernelSelectionFactory final +{ +public: + /** Static method to select the GEMM kernel according to the GPU target and the GEMM's dimensionality + * + * @param[in] gpu GPU target + * + * @return CLGEMMKernelSelection class + */ + static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu); + case GPUTarget::BIFROST: + return std::make_unique<CLGEMMDefaultTypeBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<CLGEMMDefaultTypeValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp deleted file mode 100644 index 041e7d6cb4..0000000000 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
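A hedged usage sketch for the CLGEMMKernelSelectionFactory above; the shape values are invented, and the parameter fields match the ones populated later in CLGEMMAutoHeuristics.cpp:

    #include <memory>

    #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
    #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm;

    CLGEMMKernelType pick_kernel_type(GPUTarget gpu)
    {
        // Dispatches to the Midgard/Bifrost/Valhall heuristic matching this GPU.
        std::unique_ptr<ICLGEMMKernelSelection> selector = CLGEMMKernelSelectionFactory::create(gpu);

        CLGEMMKernelSelectionParams params;
        params.m               = 128; // invented shape
        params.n               = 128;
        params.k               = 256;
        params.b               = 1;
        params.is_rhs_constant = true; // e.g. convolution weights, reshaped only once
        params.data_type       = DataType::F32;

        return selector->select_kernel(params);
    }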
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" - -#include <map> -#include <utility> - -namespace arm_compute -{ -namespace cl_gemm -{ -CLGEMMKernelSelectionBifrost::CLGEMMKernelSelectionBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) -{ -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::select_kernel(const CLGEMMKernelSelectionParams &params) -{ - // _target could be used in the future to have a dedicated heuristic for each GPU IP - ARM_COMPUTE_UNUSED(_target); - - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionBifrost::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - - // Default configurations for Bifrost architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - // Mali-G71 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - // Mali-G76 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::g76_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - const DataType data_type = params.data_type; - - switch(_target) - { - case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) - { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) - { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) - { - 
return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; - - if(is_rhs_constant) - { - if((m > 1) && (n < 16)) - { - gemm_type = CLGEMMKernelType::RESHAPED_V1; - } - else if(m == 1) - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if((k > 256) && (m > 4)) - { - constexpr float alpha = 3.2f; - constexpr float fact0 = 1.51f; - constexpr float fact1 = 1.66f; - constexpr float ops = 12.0f; - const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; - } - else - { - gemm_type = CLGEMMKernelType::NATIVE_V1; - } - } - - const auto workload = static_cast<float>((m * n) / 20.0f); - - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED_V1)) ? CLGEMMKernelType::RESHAPED : gemm_type; - } - - return gemm_type; -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(n, k); - if(is_rhs_constant) - { - if(m == 1) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - else - { - return CLGEMMKernelType::NATIVE_V1; - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - if(is_rhs_constant) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::NATIVE; - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; - - if(is_rhs_constant) - { - if((m > 1) && (n < 16)) - { - gemm_type = CLGEMMKernelType::RESHAPED; - } - else if(m == 1) - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if((k > 256) && (m > 4)) - { - gemm_type = CLGEMMKernelType::RESHAPED; - } - else - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - } - - return gemm_type; -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - if(is_rhs_constant) - { - if(m == 1) - { - if(n > k) - { - return CLGEMMKernelType::NATIVE_V1; - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - else - { - return CLGEMMKernelType::NATIVE_V1; - } -} -} // namespace cl_gemm -} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp deleted file mode 100644 index 775bb9bffd..0000000000 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" - -#include <map> -#include <utility> - -namespace arm_compute -{ -namespace cl_gemm -{ -CLGEMMKernelSelectionValhall::CLGEMMKernelSelectionValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) -{ -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::select_kernel(const CLGEMMKernelSelectionParams &params) -{ - // _target could be used in the future to have a dedicated heuristic for each GPU IP - ARM_COMPUTE_UNUSED(_target); - - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionValhall::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - - // Configurations for Valhall architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = - { - { DataType::F32, &CLGEMMKernelSelectionValhall::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionValhall::default_q8 } - }; - - const DataType data_type = params.data_type; - - if(gemm_configs.find(data_type) != gemm_configs.end()) - { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - - ARM_COMPUTE_ERROR("Not supported data type"); -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - if(is_rhs_constant) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::NATIVE; - } -} -} // namespace cl_gemm -} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp new file mode 100644 index 0000000000..8df57197e2 --- /dev/null +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" + +#include "arm_compute/core/Log.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" +#include "src/runtime/CL/mlgo/Utils.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +namespace auto_heuristics +{ +using namespace arm_compute::opencl::kernels::gemm; + +GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run) +{ + ARM_COMPUTE_UNUSED(reshape_b_only_on_first_run); + bool valid = false; + CLGEMMKernelType gemm_type{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMTypeResult(valid, gemm_type); +} + +GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run) +{ + std::unique_ptr<ICLGEMMKernelSelection> default_heuristics = CLGEMMKernelSelectionFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(default_heuristics.get()); + + CLGEMMKernelSelectionParams params; + params.m = query.m; + params.n = query.n; + params.k = query.k; + params.b = query.b; + params.is_rhs_constant = reshape_b_only_on_first_run; + params.data_type = query.data_type; + + const auto kernel_type = default_heuristics->select_kernel(params); + return GEMMTypeResult(true, kernel_type); +} + +GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigReshapedOnlyRHS config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + // Setting irrelevant unsigned int parameters to 1 and bool parameters to 
false as they do not matter + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} + +GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigReshaped config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} + +GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigNative config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} +} // namespace auto_heuristics + +} // namespace cl_gemm +} // namespace arm_compute diff --git 
a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h new file mode 100644 index 0000000000..f544715e03 --- /dev/null +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H +#define SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +namespace auto_heuristics +{ +/** A collection of adaptor functions that enable the auto selection between mlgo-based heuristics and default heuristics */ + +/** Common query */ +struct CommonQuery +{ + GPUTarget gpu_target; /**< Which @ref GPUTarget to query about */ + DataType data_type; /**< Data type */ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +/** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ +struct GEMMTypeResult +{ + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} + { + } + /** Test if the result is valid */ + operator bool() const + { + return valid; + } + bool valid; /** If the result is valid */ + CLGEMMKernelType gemm_type; /** @ref CLGEMMKernelType */ +}; + +/** Result of querying about GEMM config ( @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo) */ +struct GEMMConfigResult +{ + GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} + { + } + /** Test if the result is valid */ + operator bool() const + { + return valid; + } + bool valid; /** If the result is valid */ + GEMMLHSMatrixInfo lhs_info; /** @ref GEMMLHSMatrixInfo */ + GEMMRHSMatrixInfo rhs_info; /** @ref GEMMRHSMatrixInfo */ +}; + +/** Select gemm type based on mlgo heuristics + * @param query Query + * @param reshape_b_only_on_first_run Additional query parameter if reshape b only on first run + * @return GEMMTypeResult. 
Result is valid if bool(GEMMTypeResult) == true and invalid otherwise + */ +GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run); + +/** Select gemm type based on default heuristics + * @param query Query + * @param reshape_b_only_on_first_run Additional query parameter if reshape b only on first run + * @return GEMMTypeResult. Result is valid if bool(GEMMTypeResult) == true and invalid otherwise + */ +GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); + +} // namespace auto_heuristics +} // namespace cl_gemm +} // namespace arm_compute + +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h new file mode 100644 index 0000000000..08a7ee8c18 --- /dev/null +++ b/src/runtime/CL/mlgo/Common.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
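The validity flag on GEMMTypeResult/GEMMConfigResult above exists so that callers can try the MLGO heuristics first and fall back to the hand-written defaults. A sketch of that intended flow, assuming only the declarations above (the real call sites live in the CLGEMM functions, outside this hunk):

    #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm::auto_heuristics;

    CLGEMMKernelType choose_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
    {
        // Valid only when an MLGO heuristics file was loaded and matched the query.
        const GEMMTypeResult mlgo_result = select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
        if (mlgo_result)
        {
            return mlgo_result.gemm_type;
        }
        // The default per-target heuristics always produce a valid result.
        return select_default_gemm_kernel(query, reshape_b_only_on_first_run).gemm_type;
    }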
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_COMMON_H +#define SRC_RUNTIME_CL_MLGO_COMMON_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +namespace arm_compute +{ +namespace mlgo +{ +/** Types of Heuristic (tree) */ +enum class HeuristicType +{ + GEMM_Type, /**< About the type of gemm */ + GEMM_Config_Native, /**< About the gemm config for native kernel */ + GEMM_Config_Reshaped_Only_RHS, /**< About the gemm config for reshaped only rhs kernel */ + GEMM_Config_Reshaped /**< About the gemm config for reshaped kernel */ +}; + +using GEMMType = CLGEMMKernelType; + +/** GEMM Configuration for Native kernel */ +struct GEMMConfigNative +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ +}; + +/** GEMM Configuration for Reshaped Only RHS kernel */ +struct GEMMConfigReshapedOnlyRHS +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ +}; + +/** GEMM Configuration for Reshaped kernel */ +struct GEMMConfigReshaped +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ +}; + +} // namespace mlgo +} // namespace arm_compute +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp new file mode 100644 index 0000000000..f7b706902b --- /dev/null +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/CL/mlgo/HeuristicTree.h" + +#include "arm_compute/core/Log.h" + +#include "support/Cast.h" + +#include <algorithm> +#include <deque> +#include <set> +namespace arm_compute +{ +namespace mlgo +{ +namespace +{ +bool evaluate(GEMMShape shape, Condition cond) +{ + // PRE: all features and ConditionalOps are valid + constexpr float eps = 0.0001f; + // Calculate all secondary features + std::vector<std::pair<std::string, float>> cond_values{ + {"m", static_cast<float>(shape.m)}, + {"n", static_cast<float>(shape.n)}, + {"k", static_cast<float>(shape.k)}, + {"b", static_cast<float>(shape.b)}, + {"r_mn", static_cast<float>(shape.m) / shape.n}, + {"r_mk", static_cast<float>(shape.m) / shape.k}, + {"r_nk", static_cast<float>(shape.n) / shape.k}, + {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)}, + {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); + + ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); + const float cond_value = cond_value_pair_it->second; + switch (cond.op) + { + case ConditionalOp::LT: + { + return cond_value < cond.threshold; + } + case ConditionalOp::LE: + { + return cond_value <= cond.threshold; + } + case ConditionalOp::GT: + { + return cond_value > cond.threshold; + } + case ConditionalOp::GE: + { + return cond_value >= cond.threshold; + } + case ConditionalOp::EQ: + default: + { + return std::abs(cond_value - cond.threshold) < eps; + } + } +} + +} // namespace + +constexpr size_t HeuristicTree::_max_num_nodes; +constexpr size_t HeuristicTree::_max_query_depth; +constexpr HeuristicTree::NodeID HeuristicTree::_root; + +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +{ +} + +HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} +{ +} + +template <typename T> +std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const +{ + // Root ID = 0; + auto cur_node = _tree.at(_root).get(); + size_t depth = 0; + while (cur_node->type() != NodeType::Leaf) + { + if (depth > _max_query_depth) + { + 
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", + _max_query_depth); + return std::make_pair(false, T{}); + } + ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); + auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); + if (evaluate(shape, br_node->condition)) + { + cur_node = _tree.at(br_node->true_node).get(); + } + else + { + cur_node = _tree.at(br_node->false_node).get(); + } + ++depth; + } + ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Leaf, "Unexpected NodeType"); + auto l_node = utils::cast::polymorphic_downcast<LeafNode<T> *>(cur_node); + return std::make_pair(true, l_node->value); +} + +template <typename T> +bool HeuristicTree::add_leaf(NodeID id, T val) +{ + if (_tree.size() >= _max_num_nodes) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); + return false; + } + if (_tree.find(id) != _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); + return false; + } + _tree[id] = std::make_unique<LeafNode<T>>(id, val); + return true; +} + +bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) +{ + if (_tree.size() >= _max_num_nodes) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); + return false; + } + + const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); + return false; + } + + if (_tree.find(id) != _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); + return false; + } + _tree[id] = std::make_unique<BranchNode>(id, cond, t_node, f_node); + return true; +} + +bool HeuristicTree::check_if_structurally_correct() const +{ + std::set<NodeID> visited; + std::deque<NodeID> to_visit{_root}; + + while (!to_visit.empty()) + { + auto id = to_visit.front(); + to_visit.pop_front(); + if (_tree.find(id) == _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); + return false; + } + auto not_seen_before = visited.insert(id); + if (!not_seen_before.second) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); + return false; + } + auto cur_node = _tree.at(id).get(); + if (cur_node->type() == NodeType::Branch) + { + auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); + to_visit.push_back(br_node->true_node); + to_visit.push_back(br_node->false_node); + } + } + if (visited.size() != _tree.size()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); + return false; + } + return true; +} + +bool HeuristicTree::check() +{ + if (_tree.empty()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered"); + return false; + } + if (_tree.find(_root) == _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. 
Root must have a Node ID of %zu", _root); + return false; + } + return check_if_structurally_correct(); +} + +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigReshapedOnlyRHS> +HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const; + +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMType val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigNative val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigReshapedOnlyRHS val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigReshaped val); + +} // namespace mlgo + +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h new file mode 100644 index 0000000000..a4f8c116b9 --- /dev/null +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H +#define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H + +#include "arm_compute/core/Types.h" + +#include "src/runtime/CL/mlgo/Common.h" + +#include <map> +#include <memory> +#include <string> +#include <utility> + +namespace arm_compute +{ +namespace mlgo +{ +/** Conditional ops */ +enum class ConditionalOp +{ + EQ, /**< Equal */ + LT, /**< Less than */ + LE, /**< Less than or equal to */ + GT, /**< Greater than */ + GE, /**< Greater than or equal to */ +}; + +/** A branch condition expression evaluating: feature op threshold */ +struct Condition +{ + std::string feature; /**< Feature name */ + ConditionalOp op; /**< Conditional op */ + float threshold; /**< Threshold value */ +}; + +/** GEMM Shape used for query */ +struct GEMMShape +{ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +/** A binary decision tree based heuristic */ +class HeuristicTree +{ +public: + using NodeID = size_t; + using TreeID = size_t; + using Index = std::tuple<HeuristicType, std::string, DataType>; + enum class NodeType + { + Branch, + Leaf + }; + struct Node + { + virtual NodeType type() const = 0; + virtual ~Node() = default; + }; + + struct BranchNode : public Node + { + BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} + { + } + NodeType type() const override + { + return NodeType::Branch; + } + NodeID id; + Condition condition; + NodeID true_node; + NodeID false_node; + }; + + template <typename T> + struct LeafNode : public Node + { + LeafNode(NodeID id, T val) : id{id}, value{val} + { + } + NodeType type() const override + { + return NodeType::Leaf; + } + NodeID id; + T value; + }; + +public: + /** Constructor */ + HeuristicTree(); + /** Constructor */ + HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type); + // Since the HeuristicTree is a handle that owns the nodes, it is move-only + /** Prevent copy construction */ + HeuristicTree(const HeuristicTree &) = delete; + /** Prevent copy assignment */ + HeuristicTree &operator=(const HeuristicTree &) = delete; + /** Move constructor */ + HeuristicTree(HeuristicTree &&other) noexcept = default; + /** Move assignment */ + HeuristicTree &operator=(HeuristicTree &&other) = default; + + /** Query a leaf value given a gemm shape + * + * @tparam T Leaf value type + * @param shape A @ref GEMMShape for the query + * @return std::pair<bool, T> The bool signals whether the query succeeded + */ + template <typename T> + std::pair<bool, T> query(GEMMShape shape) const; + + /** Add a leaf node + * + * @tparam T Leaf value type + * @param id Leaf node ID + * @param leaf_value Leaf node value + * @return bool Whether the addition succeeded + */ + template <typename T> + bool add_leaf(NodeID id, T leaf_value); + /** Add a branch node + * + * @param id Branch node ID + * @param cond Branch node @ref Condition + * @param true_node True node's ID + * @param false_node False node's ID + * @return bool Whether the addition succeeded + */ + bool add_branch(NodeID id, Condition cond, NodeID true_node, NodeID false_node); + + /** Get tree ID + * @return TreeID + */ + TreeID id() const + { + return _id; + } + + /** Get tree index + * 
@return Index + */ + Index index() const + { + return std::make_tuple(_heuristic_type, _ip_target, _data_type); + } + + /** Check if tree is valid + * @return bool True if the tree is valid + */ + bool check(); + +private: + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Node ID of the tree root + +private: + bool check_if_structurally_correct() const; + +private: + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ +}; +} // namespace mlgo + +} // namespace arm_compute + +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp new file mode 100644 index 0000000000..aed46cd80f --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" + +#include "arm_compute/core/Log.h" + +#include "src/runtime/CL/mlgo/MLGOParser.h" +#include "src/runtime/CL/mlgo/Utils.h" + +#include <fstream> + +namespace arm_compute +{ +namespace mlgo +{ +bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0) == std::tie(rhs.m0, rhs.n0, rhs.k0); +} +bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); +} +bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); +} + +constexpr size_t MLGOHeuristics::_max_num_trees; + +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} +{ +} + +std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); + const auto invalid = GEMMType::RESHAPED; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMType>(shape_query); +} +std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigNative{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigNative>(shape_query); +} +std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigReshapedOnlyRHS{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. 
Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query); +} +std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigReshaped{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigReshaped>(shape_query); +} + +bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) +{ + bool status; + HeuristicTree *tree{nullptr}; + std::tie(status, tree) = get_heuristic_tree(id); + if (!status) + { + return status; + } + status = tree->check(); + if (!status) + { + return status; + } + _tree_valid[id] = true; + return true; +} + +bool MLGOHeuristics::check_all() const +{ + // Tree validities are already checked and cached. + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. This could also indicate there are no trees in the dotmlgo"); + return false; + } + + // Other top level checks... 
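+ // (None at present: uniqueness of tree IDs and indices is already enforced at insertion time by add_heuristic_tree.)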
+ + return true; +} + +std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) +{ + if (_indices.find(id) == _indices.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); + return std::make_pair(false, nullptr); + } + const auto index = _indices[id]; + + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return std::make_pair(false, nullptr); + } + auto &t = _trees[index]; + + return std::make_pair(true, &t); +} + +bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) +{ + if (_indices.size() >= _max_num_trees) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); + return false; + } + // PRE: correctness of t is guaranteed by the tree construction process + // Ensure unique id + const auto id = t.id(); + if (_indices.find(id) != _indices.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); + return false; + } + + // Ensure unique index + const auto index = t.index(); + if (_trees.find(index) != _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); + return false; + } + + _indices[id] = index; + _trees[index] = std::move(t); + _tree_valid[id] = false; + return true; +} + +bool MLGOHeuristics::reload_from_file(const std::string &filename) +{ + std::ifstream fs; + fs.exceptions(std::ifstream::badbit); + fs.open(filename, std::ios::in); + if (!fs.is_open()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); + return _valid = false; + } + return reload_from_stream(fs); +} + +bool MLGOHeuristics::reload_from_stream(std::istream &in) +{ + auto parsed = parser::parse_mlgo(in); + if (!parsed.first) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead"); + return _valid = false; + } + *this = std::move(parsed.second); + ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO loaded successfully"); + return _valid = true; +} + +} // namespace mlgo +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h new file mode 100644 index 0000000000..6a491c5503 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H +#define SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H + +#include "src/runtime/CL/mlgo/Common.h" +#include "src/runtime/CL/mlgo/HeuristicTree.h" + +#include <iostream> +#include <map> +#include <string> +#include <utility> +namespace arm_compute +{ +namespace mlgo +{ +/** Query interface */ +struct Query +{ + std::string ip_target; /**< The name of the IP target */ + DataType data_type; /**< Data type */ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs); +bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs); +bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs); + +/** MLGOHeuristics for configuring GEMM kernels */ +class MLGOHeuristics +{ +public: + /** Constructor */ + MLGOHeuristics(); + /** Default Destructor */ + ~MLGOHeuristics() = default; + /** Prevent Copy Construct */ + MLGOHeuristics(const MLGOHeuristics &) = delete; + /** Prevent Copy Assignment */ + MLGOHeuristics &operator=(const MLGOHeuristics &) = delete; + /** Default Move Constructor */ + MLGOHeuristics(MLGOHeuristics &&) = default; + /** Default Move Assignment */ + MLGOHeuristics &operator=(MLGOHeuristics &&) = default; + /** Query the gemm type + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMType> signals if the query succeeded or failed + */ + std::pair<bool, GEMMType> query_gemm_type(const Query &query) const; + /** Query the gemm configuration for native kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigNative> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigNative> query_gemm_config_native(const Query &query) const; + /** Query the gemm configuration for reshaped only rhs kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigReshapedOnlyRHS> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigReshapedOnlyRHS> query_gemm_config_reshaped_only_rhs(const Query &query) const; + /** Query the gemm configuration for reshaped kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigReshaped> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigReshaped> query_gemm_config_reshaped(const Query &query) const; + /** (Re)Load the heuristics from reading a dotmlgo file + * + * @param[in] filename Path to the dotmlgo file + * + * @return bool Signals if the reload succeeded or failed + */ + bool reload_from_file(const std::string &filename); + /** (Re)Load the heuristics from reading an input stream + * + * @param[in] istream Istream containing mlgo heuristics + * + * @return bool Signals if the reload succeeded or failed + */ + bool reload_from_stream(std::istream &istream); + + /** Get the heuristic tree from tree id + * + * @param[in] id Tree id. 
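+ * (i.e. the entry-id the tree was registered with in the heuristics table)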
+ * + * @return std::pair<bool, HeuristicTree *> The bool signals whether the tree was found + */ + std::pair<bool, HeuristicTree *> get_heuristic_tree(HeuristicTree::TreeID id); + /** Add a heuristic tree + * @param t Heuristic tree to be added + * @return bool Whether the addition succeeded + */ + bool add_heuristic_tree(HeuristicTree &&t); + + /** Check the validity of the heuristic tree. + * + * @param id ID of the tree to be checked + * + * @return bool + */ + bool check_heuristic_tree(HeuristicTree::TreeID id); + + /** Check the overall validity of the heuristics. + * @return bool + */ + bool check_all() const; + +private: + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added */ + +private: + // There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree + std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */ + std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */ + std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */ + bool _valid; /**< Overall validity */ +}; + +} // namespace mlgo +} // namespace arm_compute +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp new file mode 100644 index 0000000000..893daf2ed9 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/MLGOParser.h" + +#include "arm_compute/core/Log.h" + +#include "src/runtime/CL/mlgo/Utils.h" + +#include <sstream> + +#define CHECK(parser_expr, valid_var) \ + (parser_expr); \ + if (!valid_var) \ + return; + +#define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ + (parser_expr); \ + if (!valid_var) \ + return default_val; + +#ifdef ARM_COMPUTE_LOGGING_ENABLED + +#define FAIL_WITH_MSG(valid_var, pos, msg) \ + std::stringstream ss; \ + ss << "MLGOParser Error: " << pos << " " << msg; \ + ARM_COMPUTE_LOG_INFO_MSG_CORE(ss.str().c_str()); \ + valid_var = false; \ + return; + +#define FAIL_WITH_MSG_DEFAULT(valid_var, default_val, pos, msg) \ + std::stringstream ss; \ + ss << "MLGOParser Error: " << pos << " " << msg; \ + ARM_COMPUTE_LOG_INFO_MSG_CORE(ss.str().c_str()); \ + valid_var = false; \ + return default_val; + +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); + +#else // ARM_COMPUTE_LOGGING_ENABLED + +#define FAIL_WITH_MSG(valid_var, pos, msg) \ + valid_var = false; \ + return; + +#define FAIL_WITH_MSG_DEFAULT(valid_var, default_val, pos, msg) \ + valid_var = false; \ + return default_val; + +#define LOG_TOKEN_POS(tokens, pos_var) + +#endif // ARM_COMPUTE_LOGGING_ENABLED +namespace +{ +void ltrim(std::string &str) +{ + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); +} + +void rtrim(std::string &str) +{ + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); +} + +void trim(std::string &str) +{ + ltrim(str); + rtrim(str); +} +} // namespace + +namespace arm_compute +{ +namespace mlgo +{ +namespace parser +{ +enum class ComparatorType +{ + Enum, + Num, + Var +}; + +TokenStream::TokenStream(std::istream &s, const std::string &delims) + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} +{ + read(); +} + +TokenStream::operator bool() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + return !reached_end(); +} + +Token TokenStream::take() +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + Token t = _tokens.front(); + _tokens.pop_front(); + if (_tokens.empty()) + { + read(); + } + return t; +} +Token TokenStream::peek(size_t i) +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); + // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end + while (_istream && _tokens.size() <= i) + { + read(); + } + size_t ind = std::min(i, _tokens.size() - 1); + return _tokens[ind]; +} + +void advance(CharPosition &pos, char ch) +{ + if (ch == '\n') + { + pos.ln += 1; + pos.col = 0; + } + else + { + pos.col += 1; + } +} +void rewind(CharPosition &pos) +{ + pos.col -= 1; +} +void TokenStream::read() +{ + char ch; + // Skip any leading space and delim characters + do + { + // Reached eof + if (!_istream.get(ch)) + { + if (!reached_end()) + { + _tokens.emplace_back(TokenType::End, "", _lookahead_pos); + } + return; + } + advance(_lookahead_pos, ch); + } while (std::isspace(ch) || is_delim(ch)); + // Read chars until we hit a delim or eof + auto orig_pos = _lookahead_pos; + auto tok = recognize_tok(ch); + rewind(orig_pos); + tok.pos = orig_pos; + // Trim leading and trailing white spaces + trim(tok.value); + _tokens.push_back(tok); +} + +Token TokenStream::recognize_tok(char ch) +{ + 
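// Single-character dispatch: '[' and ']' become list tokens, '.' or a digit starts a numeric token, and anything else is scanned as a text token until a delimiter or bracket. + 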
if (ch == '[') + { + return Token{TokenType::L_List, "", _lookahead_pos}; + } + else if (ch == ']') + { + return Token{TokenType::R_List, "", _lookahead_pos}; + } + else if (ch == '.') + { + return float_after_dp_st(std::string{ch}); + } + else if (std::isdigit(ch)) + { + return num_st(std::string{ch}); + } + else + { + return text_st(std::string{ch}); + } +} + +Token TokenStream::num_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (ch == '.') + { + return float_after_dp_st(value + ch); + } + else if (!std::isdigit(ch)) + { + if (!is_delim(ch) && !std::isspace(ch)) + { + rewind(_lookahead_pos); + _istream.unget(); + } + break; + } + value += ch; + } + return Token{TokenType::Int, value, _lookahead_pos}; +} + +Token TokenStream::float_after_dp_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (!std::isdigit(ch)) + { + if (!is_delim(ch) && !std::isspace(ch)) + { + rewind(_lookahead_pos); + _istream.unget(); + } + break; + } + value += ch; + } + return Token{TokenType::Float, value, _lookahead_pos}; +} + +Token TokenStream::text_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (is_delim(ch)) + { + break; + } + if (ch == '[' || ch == ']') + { + rewind(_lookahead_pos); + _istream.unget(); + break; + } + value += ch; + } + return Token{TokenType::Text, value, _lookahead_pos}; +} + +bool TokenStream::reached_end() const +{ + return _tokens.size() == 1 && _tokens.front().type == TokenType::End; +} + +bool TokenStream::is_delim(char ch) const +{ + return _delims.find(ch) != std::string::npos; +} + +void end(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::End) + { + FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); + } +} + +bool bool_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Int) + { + FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); + } + bool val{}; + std::stringstream(tok.value) >> val; + return val; +} + +int int_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Int) + { + FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); + } + int val{}; + std::stringstream(tok.value) >> val; + return val; +} + +unsigned int uint_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); + if (val < 0) + { + FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); + } + return static_cast<unsigned int>(val); +} + +float float_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Float) + { + FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); + } + float val{}; + std::stringstream(tok.value) >> val; + return val; +} + +std::string text_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Text || tok.value.empty()) + { + FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); + } + return tok.value; +} + +bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) +{ + auto tok = in.peek(); + if (tok.type == TokenType::Text && tok.value == c_str) + { + if (take) + { + in.take(); + } + return true; + } + return false; +} + +void expect_text(TokenStream &in, const 
std::string &str, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_text(in, str)) + { + FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); + } +} + +bool accept_l_list(TokenStream &in) +{ + auto tok = in.peek(); + if (tok.type == TokenType::L_List) + { + in.take(); + return true; + } + return false; +} + +void expect_l_list(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_l_list(in)) + { + FAIL_WITH_MSG(valid, pos, "Expect '['"); + } +} + +bool accept_r_list(TokenStream &in) +{ + auto tok = in.peek(); + if (tok.type == TokenType::R_List) + { + in.take(); + return true; + } + return false; +} + +void expect_r_list(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_r_list(in)) + { + FAIL_WITH_MSG(valid, pos, "Expect ']'"); + } +} + +ConditionalOp conditional_op(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "<=")) + { + return ConditionalOp::LE; + } + else if (accept_text(in, ">=")) + { + return ConditionalOp::GE; + } + else if (accept_text(in, "==")) + { + return ConditionalOp::EQ; + } + else if (accept_text(in, "<")) + { + return ConditionalOp::LT; + } + else if (accept_text(in, ">")) + { + return ConditionalOp::GT; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, ConditionalOp::EQ, pos, "Expect conditional op"); + } +} + +void gemm_version(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "gemm-version", valid), valid); + CHECK(expect_l_list(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(expect_r_list(in, valid), valid); +} + +void ip_type(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "ip-type", valid), valid); + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "gpu")) + { + ; + } + else if (accept_text(in, "cpu")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect ip type"); + } +} + +void header(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "<header>", valid), valid); + CHECK(gemm_version(in, valid), valid); + CHECK(ip_type(in, valid), valid); + CHECK(expect_text(in, "</header>", valid), valid); +} + +DataType data_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "f16")) + { + return DataType::F16; + } + else if (accept_text(in, "f32")) + { + return DataType::F32; + } + else if (accept_text(in, "qasymm8")) + { + return DataType::QASYMM8; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, DataType::QASYMM8, pos, "Expect data type"); + } +} + +ComparatorType comparator_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "var")) + { + return ComparatorType::Var; + } + else if (accept_text(in, "num")) + { + return ComparatorType::Num; + } + else if (accept_text(in, "enum")) + { + return ComparatorType::Enum; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, ComparatorType::Num, pos, "Expect comparator type"); + } +} + +HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "gemm-type", take)) + { + return HeuristicType::GEMM_Type; + } + else if (accept_text(in, "gemm-config-native", take)) + { + return HeuristicType::GEMM_Config_Native; + } + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) + { + return HeuristicType::GEMM_Config_Reshaped_Only_RHS; + } + else if (accept_text(in, "gemm-config-reshaped", take)) + { + return HeuristicType::GEMM_Config_Reshaped; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, 
HeuristicType::GEMM_Config_Reshaped, pos, "Expect heuristic type"); + } +} + +void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto ht = CHECK(heuristic_type(in, valid, false), valid); + if (ht != expected_ht) + { + FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); + } + CHECK(heuristic_type(in, valid, true), valid); +} + +GEMMType gemm_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "native")) + { + return GEMMType::NATIVE; + } + else if (accept_text(in, "reshaped-only-rhs")) + { + return GEMMType::RESHAPED_ONLY_RHS; + } + else if (accept_text(in, "reshaped")) + { + return GEMMType::RESHAPED; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, GEMMType::RESHAPED_ONLY_RHS, pos, "Expect gemm type"); + } +} + +GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigNative{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigNative{m0, n0, k0}; +} + +GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigReshapedOnlyRHS{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto h0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto ir = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; +} + +GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigReshaped{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto v0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto h0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto il = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ir = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; +} + +void gpu_priority(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "best-performance")) + { + ; + } + else if (accept_text(in, "best-memory-usage")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect gpu priority"); + } +} + +void gpu_behavior(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "static")) + { 
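+ // Token accepted; the behavior value itself is currently discarded, as gpu behavior does not influence tree construction.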
+ ; + } + else if (accept_text(in, "dynamic")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect gpu behavior"); + } +} + +void free_vars(TokenStream &in, bool &valid) +{ + CHECK(expect_l_list(in, valid), valid); + while (!accept_r_list(in)) + { + CHECK(text_val(in, valid), valid); + } +} + +void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + const auto id = CHECK(uint_val(in, valid), valid); + const auto ip = CHECK(text_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); // Num cores + const auto dt = CHECK(data_type(in, valid), valid); + CHECK(gpu_priority(in, valid), valid); + CHECK(gpu_behavior(in, valid), valid); + const auto ht = CHECK(heuristic_type(in, valid), valid); + CHECK(free_vars(in, valid), valid); + HeuristicTree t(id, ht, ip, dt); + valid = CHECK(h.add_heuristic_tree(std::move(t)), valid); +} + +void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + CHECK(expect_text(in, "<heuristics-table>", valid), valid); + while (!accept_text(in, "</heuristics-table>")) + { + CHECK(heuristics_table_entry(in, h, valid), valid); + } +} + +Condition condition(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + // NOTE: Only simplified Conditions are accepted, which means the lhs comparator type is fixed to Var and that of + // the rhs is fixed to Num (float) + const auto invalid_val = Condition{}; + const auto l_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); + const auto l_v = CHECK_DEFAULT(text_val(in, valid), valid, invalid_val); + const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); + const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); + const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) + { + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + } + return Condition{l_v, c_o, r_v}; +} + +void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + CHECK(expect_text(in, "<heuristic", valid), valid); + const auto tree_id = CHECK(uint_val(in, valid), valid); + CHECK(expect_text(in, ">", valid), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + const HeuristicType t_heuristic_type = std::get<0>(t->index()); + while (!accept_text(in, "</heuristic>")) + { + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "b")) + { + // Branch node + const auto id = CHECK(uint_val(in, valid), valid); + const auto cond = CHECK(condition(in, valid), valid); + const auto t_id = CHECK(uint_val(in, valid), valid); + const auto f_id = CHECK(uint_val(in, valid), valid); + valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); + } + else if (accept_text(in, "l")) + { + // Leaf node + const auto id = CHECK(uint_val(in, valid), valid); + // NOTE: Heuristic type within each tree appears to be redundant (same information can be obtained from the + // heuristic table). For now it remains as a step for validation. 
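+ // For illustration only (hypothetical values, following the grammar in MLGOParser.h), a branch and a leaf line in the DotMLGO text could look like: + // b,0,var,m,<=,num,80.0,1,2 + // l,1,gemm-type,reshaped-only-rhs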
+ LOG_TOKEN_POS(in, pos); + CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); + switch (t_heuristic_type) + { + case HeuristicType::GEMM_Type: + { + const auto g_type = CHECK(gemm_type(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_type), valid); + break; + } + case HeuristicType::GEMM_Config_Native: + { + const auto g_c = CHECK(gemm_config_native(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + case HeuristicType::GEMM_Config_Reshaped_Only_RHS: + { + const auto g_c = CHECK(gemm_config_reshaped_only_rhs(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + case HeuristicType::GEMM_Config_Reshaped: + { + const auto g_c = CHECK(gemm_config_reshaped(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + default: + { + FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); + } + } + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect tree node type"); + } + } + // Perform semantic checks in the middle of parsing so that it can fail fast should there be any invalidities + valid = CHECK(h.check_heuristic_tree(tree_id), valid); +} + +MLGOHeuristics mlgo(TokenStream &in, bool &valid) +{ + MLGOHeuristics h; + CHECK_DEFAULT(header(in, valid), valid, h); + CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); + while (accept_text(in, "<heuristic", false)) + { + CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h); + } + CHECK_DEFAULT(end(in, valid), valid, h); + valid = CHECK_DEFAULT(h.check_all(), valid, h); + return h; +} + +std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in) +{ + auto tokens = TokenStream(in); + bool valid = true; + auto h = mlgo(tokens, valid); + return std::make_pair(std::move(valid), std::move(h)); +} +} // namespace parser +} // namespace mlgo +} // namespace arm_compute + +#undef CHECK +#undef CHECK_DEFAULT +#undef FAIL_WITH_MSG +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h new file mode 100644 index 0000000000..cffce8d6a1 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H +#define SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H + +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" + +#include <deque> +#include <istream> +#include <string> +#include <utility> + +/** A DotMLGO file parser (LL(k) parser) + * + * The grammar of DotMLGO is defined as the following EBNF: + * + * delim = "," | "\n"; // Note that delimiters are omitted from the definition below + * + * mlgo = header, heuristics-table, {heuristic-tree}; + * + * header = "<header>", gemm-version, ip-type, "</header>"; + * gemm-version = "gemm-version", "[", int, int, int, "]"; + * ip-type = "ip-type", ("gpu" | "cpu"); + * + * heuristics-table = "<heuristics-table>", {heuristics-table-entry}, "</heuristics-table>"; + * heuristics-table-entry = entry-id, ip-name, num-cores, data-type, gpu-priority, gpu-behavior, heuristic-type, free-vars; + * entry-id = int; + * ip-name = char-sequence; + * num-cores = int; + * data-type = "f32" | "f16" | "qasymm8"; + * gpu-priority = "best-performance" | "best-memory-usage"; + * gpu-behavior = "static" | "dynamic"; + * heuristic-type = "gemm-type" | "gemm-config-native" | "gemm-config-reshaped-only-rhs" | "gemm-config-reshaped"; + * free-vars = "[", {char-sequence}, "]"; + * + * heuristic-tree = "<heuristic", entry-id, ">", {tree-node}, "</heuristic>"; + * tree-node = branch-node | leaf-node; + * branch-node = "b", entry-id, lhs-type, lhs-value, conditional-op, rhs-type, rhs-value, true-node, false-node; + * lhs-type = comparator-type; + * lhs-value = comparator-value; + * rhs-type = comparator-type; + * rhs-value = comparator-value; + * comparator-type = "var" | "num" | "enum"; + * comparator-value = char-sequence | float; + * conditional-op = "<" | "<=" | "==" | ">=" | ">"; + * true-node = entry-id; + * false-node = entry-id; + * leaf-node = "l", entry-id, heuristic-type, leaf-value; + * leaf-value = gemm-type | gemm-config-native | gemm-config-reshaped-only-rhs | gemm-config-reshaped; + * gemm-type = "native" | "reshaped-only-rhs" | "reshaped"; + * gemm-config-native = "[", int, int, int, "]"; + * gemm-config-reshaped-only-rhs = "[", int, int, int, int, bool, bool, bool, "]"; + * gemm-config-reshaped = "[", int, int, int, int, int, bool, bool, bool, bool, "]"; + */ + +namespace arm_compute +{ +namespace mlgo +{ +namespace parser +{ +/** Type of Token */ +enum class TokenType +{ + L_List = '[', /**< List open */ + R_List = ']', /**< List close */ + Int, /**< Integral */ + Float, /**< Floating */ + Text, /**< Text/String */ + End, /**< End of stream */ +}; + +struct CharPosition +{ + bool operator==(const CharPosition &other) const + { + return ln == other.ln && col == other.col; + } + + size_t ln{0}; + size_t col{0}; +}; + +/** Token */ +struct Token +{ + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} + { + } + + bool operator==(const Token &other) const + { + return type == other.type && value == other.value && pos == other.pos; + } + + TokenType type; /**< Token type */ + std::string value; /**< Token value */ + CharPosition pos; +}; + +/** A stream of tokens */ +class TokenStream +{ + // NOTE: _tokens is never empty. The end of token stream is signalled by the End Token +public: + static constexpr size_t max_look_ahead = 10; + +public: + /** Constructor + * + * @param[in] s Input stream + * @param[in] delims Delimiter characters packed in a string. 
Each char from the string can be used as a delim on its own + */ + TokenStream(std::istream &s, const std::string &delims = ",\n"); + + /** Check if there're more (non-End) Tokens + * @return true If there are more tokens + * @return false If reached end of stream (only End token) + */ + explicit operator bool() const; + + /** Get and pop off the current token + * + * @return Token + */ + Token take(); + + /** Peek the next ith token + * + * @param[in] i The next ith token. i < @ref max_look_ahead. + * + * @return Token + */ + Token peek(size_t i = 0); + + /** Get the position of the current token + * + * @return CharPosition + */ + CharPosition current_pos() const + { + return _tokens.front().pos; + } + +private: + void read(); + + Token recognize_tok(char ch); + + Token num_st(std::string value = ""); + + Token float_after_dp_st(std::string value = ""); + + Token text_st(std::string value = ""); + + bool reached_end() const; + + bool is_delim(char ch) const; + + std::string _delims; + std::istream &_istream; + std::deque<Token> _tokens; + CharPosition _lookahead_pos; +}; + +/** Parse and construct a @ref MLGOHeuristics from input stream + * + * @param[in] in Input stream + * + * @return MLGOHeuristics + */ +std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in); + +} // namespace parser +} // namespace mlgo +} // namespace arm_compute +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp new file mode 100644 index 0000000000..c7e0100b3c --- /dev/null +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/Utils.h" + +#include <sstream> + +namespace arm_compute +{ +namespace mlgo +{ +namespace +{ +template <typename T> +inline std::string to_str(const T &val) +{ + std::stringstream ss; + ss << val; + return ss.str(); +} +} // namespace + +std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) +{ + return os << "Native:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 + << "}"; +} +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) +{ + return os << "ReshapedOnlyRHS:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; +} +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) +{ + return os << "Reshaped:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; +} +std::ostream &operator<<(std::ostream &os, HeuristicType ht) +{ + switch (ht) + { + case HeuristicType::GEMM_Type: + { + os << "GEMM_Type"; + break; + } + case HeuristicType::GEMM_Config_Native: + { + os << "GEMM_Config_Native"; + break; + } + case HeuristicType::GEMM_Config_Reshaped_Only_RHS: + { + os << "GEMM_Config_Reshaped_Only_RHS"; + break; + } + case HeuristicType::GEMM_Config_Reshaped: + { + os << "GEMM_Config_Reshaped"; + break; + } + default: + { + os << "Unknown"; + break; + } + } + return os; +} +std::ostream &operator<<(std::ostream &os, DataType dt) +{ + switch (dt) + { + case DataType::F32: + { + os << "F32"; + break; + } + case DataType::F16: + { + os << "F16"; + break; + } + case DataType::QASYMM8: + { + os << "QASYMM8"; + break; + } + default: + { + os << "Unknown"; + break; + } + } + return os; +} +std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index) +{ + HeuristicType ht; + std::string ip; + DataType dt; + std::tie(ht, ip, dt) = index; + os << "Index("; + os << "HeuristicType=" << ht << ","; + os << "IP=" << ip << ","; + os << "DataType=" << dt; + os << ")"; + return os; +} +std::ostream &operator<<(std::ostream &os, const Query &query) +{ + os << "Query("; + os << "IP=" << query.ip_target << ","; + os << "DataType=" << query.data_type << ","; + os << "M=" << query.m << ","; + os << "N=" << query.n << ","; + os << "K=" << query.k << ","; + os << "B=" << query.b << ")"; + return os; +} + +std::string to_string(const GEMMConfigNative &config) +{ + return to_str(config); +} + +std::string to_string(const GEMMConfigReshapedOnlyRHS &config) +{ + return to_str(config); +} + +std::string to_string(const GEMMConfigReshaped &config) +{ + return to_str(config); +} + +std::string to_string(const Query &query) +{ + return to_str(query); +} + +namespace parser +{ +std::ostream &operator<<(std::ostream &os, const CharPosition &pos) +{ + os << "(Ln: " << pos.ln << ", Col: " << pos.col << ")"; + return os; +} +} // namespace parser + +} // namespace mlgo + +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h new file mode 100644 index 0000000000..73b537f476 --- /dev/null +++ b/src/runtime/CL/mlgo/Utils.h @@ 
-0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_UTILS_H +#define SRC_RUNTIME_CL_MLGO_UTILS_H + +#include "src/runtime/CL/mlgo/Common.h" +#include "src/runtime/CL/mlgo/HeuristicTree.h" +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" +#include "src/runtime/CL/mlgo/MLGOParser.h" + +#include <ostream> +#include <string> + +namespace arm_compute +{ +namespace mlgo +{ +std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config); +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config); +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config); +std::ostream &operator<<(std::ostream &os, HeuristicType ht); +std::ostream &operator<<(std::ostream &os, DataType dt); +std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); +std::ostream &operator<<(std::ostream &os, const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); +namespace parser +{ +std::ostream &operator<<(std::ostream &os, const CharPosition &pos); +} // namespace parser +} // namespace mlgo +} // namespace arm_compute + +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp deleted file mode 100644 index 5b23baaed3..0000000000 --- a/src/runtime/CL/tuners/BifrostTuner.cpp +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/tuners/BifrostTuner.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" - -namespace arm_compute -{ -namespace tuners -{ -namespace -{ -/** Tunes a @ref CLDirectConvolutionLayerKernel for a bifrost target - * - * @param[in] k Kernels to tune - */ -void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - - const GPUTarget gpu_target = k.get_target(); - const DataType dt = k._input->info()->data_type(); - const TensorShape weights_shape = k._weights->info()->tensor_shape(); - const TensorShape inputs_shape = k._input->info()->tensor_shape(); - const size_t kernel_size = weights_shape.x(); - const unsigned int stride_x = k._conv_stride_x; - const unsigned int stride_y = k._conv_stride_y; - - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32)) - { - // Through extensive experimentation with over 30 representative tensor - // shapes, we found a small number of local work size configurations - // that result in nearly optimal execution times. Selecting the right - // lws for a given shape, however, required a complex decision tree, - // until we constructed a simple feature as described below. - // - // We started from the number of multiply-accumulate operations for a - // convolution layer, which is equal to the product of the input - // dimensions 0..2 and the weights dimensions 0..2. Unfortunately, - // this resulted in ties between distinct shapes that required distinct - // lws configurations. Replacing the width of the input with the kernel - // size, however, resulted in nearly optimal predictions. We use underscores - // in variable names to indicate when they are intentionally misleading. 
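-            //
-            // For illustration only (hypothetical shape, not part of the original
-            // comment): a 3x3 kernel over 64 input channels gives a weights product
-            // of 3 * 3 * 64 = 576, and a 56x56x64 input gives an input product of
-            // 56 * 56 * 64 = 200704, so mega_ops_ = 1e-6 * 576 * 200704 ~= 115.6,
-            // which selects lws_hint = cl::NDRange(2, 1, 6) in the kernel_size == 3
-            // case below.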
- const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2]; - const size_t product_of_input_dimensions_ = inputs_shape[0] * inputs_shape[1] * inputs_shape[2]; - const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_; - - switch(kernel_size) - { - case 1: - { - if(mega_ops_ < 1.f) - { - lws_hint = cl::NDRange(1, 1, 8); - } - else if(mega_ops_ < 7.f) - { - lws_hint = cl::NDRange(1, 1, 4); - } - else - { - lws_hint = cl::NDRange(1, 1, 2); - } - break; - } - case 3: - { - if(mega_ops_ < 1.f) - { - lws_hint = cl::NDRange(1, 1, 8); - } - else if(mega_ops_ < 13.f) - { - lws_hint = cl::NDRange(2, 1, 4); - } - else if(mega_ops_ < 50.f) - { - lws_hint = cl::NDRange(3, 1, 4); - } - else - { - lws_hint = cl::NDRange(2, 1, 6); - } - break; - } - case 5: - { - if(mega_ops_ < 2.f || mega_ops_ > 80.f) - { - lws_hint = cl::NDRange(2, 1, 4); - } - else - { - lws_hint = cl::NDRange(2, 1, 8); - } - break; - } - default: - break; - } - k.set_lws_hint(lws_hint); - } -} - -void tune_col2im_kernel(CLCol2ImKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size for Bifrost with a value obtained - // via exhaustive autotuning over 30 representative tensor shapes. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14)) - { - lws_hint = cl::NDRange(1, 7, 1); - } - else - { - lws_hint = cl::NDRange(1, 8, 1); - } - } - - k.set_lws_hint(lws_hint); -} - -void tune_im2col_kernel(CLIm2ColKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Local work size optimized for the 11x11 AlexNet convolution on Bifrost. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && k._kernel_dims.width == 11) - { - const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height); - if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding()) - { - lws_hint = cl::NDRange(1, 1, 1); - } - } - k.set_lws_hint(lws_hint); -} - -void tune_gemv_kernel(CLGEMMMatrixVectorMultiplyKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size for Bifrost with a value obtained - // via exhaustive autotuning for the MobileNets tensor shapes. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - lws_hint = cl::NDRange(1, 1, 1); - } - - k.set_lws_hint(lws_hint); -} - -void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure LWS hint - switch(gpu_target) - { - case GPUTarget::G71: - case GPUTarget::G72: - case GPUTarget::G51: - case GPUTarget::G51BIG: - case GPUTarget::G51LIT: - case GPUTarget::G52: - case GPUTarget::G52LIT: - case GPUTarget::G76: - if(k._input1->info()->dimension(1) == 24) - { - // LWS optimized for the 11x11 AlexNet convolution on Bifrost. 
- lws_hint = cl::NDRange(2, 2); - } - else if(k._output->info()->dimension(1) == 196) - { - lws_hint = cl::NDRange(1, 7); - } - else - { - lws_hint = cl::NDRange(8, 8); - } - break; - default: - lws_hint = cl::NullRange; - } - - k.set_lws_hint(lws_hint); -} - -void tune_pooling_kernel(CLPoolingLayerKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size (hint) from the first two dimensions of the global work size. - // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized - // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is - // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with). - if(k._input->info()->data_layout() == DataLayout::NCHW) - { - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - cl::NDRange gws = ICLKernel::gws_from_window(k.window()); - lws_hint = cl::NDRange(gws[0], gws[1], 1); - } - } - - k.set_lws_hint(lws_hint); -} - -void tune_scale_kernel(CLScaleKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - const DataType dt = k.input()->info()->data_type(); - const InterpolationPolicy interpolation = k._interpolationPolicy; - - // Configure the local work size for Bifrost, interpolation (bilinear) and datatype F32. - // The value are obtained via exhaustive autotuning. - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR)) - { - auto dim_0 = k.output()->info()->dimension(0); - if(dim_0 == 480) - { - lws_hint = cl::NDRange(2, 1); - } - else if(dim_0 == 3120) - { - lws_hint = cl::NDRange(2, 8); - } - else if(dim_0 == 4160) - { - lws_hint = cl::NDRange(4, 8); - } - k.set_lws_hint(lws_hint); - } -} -} // namespace - -void BifrostTuner::tune_kernel_static(ICLKernel &kernel) -{ - if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr) - { - tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel)); - } - else if(dynamic_cast<CLCol2ImKernel *>(&kernel) != nullptr) - { - tune_col2im_kernel(*utils::cast::polymorphic_downcast<CLCol2ImKernel *>(&kernel)); - } - else if(dynamic_cast<CLIm2ColKernel *>(&kernel) != nullptr) - { - tune_im2col_kernel(*utils::cast::polymorphic_downcast<CLIm2ColKernel *>(&kernel)); - } - else if(dynamic_cast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel) != nullptr) - { - tune_gemv_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel)); - } - else if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr) - { - tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel)); - } - else if(dynamic_cast<CLPoolingLayerKernel *>(&kernel) != nullptr) - { - tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel)); - } - else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr) - { - tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel)); - } -} - -void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel) -{ - ARM_COMPUTE_UNUSED(kernel); -} -} // namespace tuners -} // namespace arm_compute
\ No newline at end of file diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp deleted file mode 100644 index 30fd558ef3..0000000000 --- a/src/runtime/CL/tuners/CLLWSList.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/tuners/CLLWSList.h" - -namespace arm_compute -{ -namespace cl_tuner -{ -size_t CLLWSList::size() -{ - return search_space_shape.total_size(); -} - -cl::NDRange CLLWSListExhaustive::operator[](size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= size()); - auto coords = index2coords(search_space_shape, index); - return cl::NDRange{ coords[0] + 1U, coords[1] + 1U, coords[2] + 1U }; -} - -CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws) -{ - ARM_COMPUTE_UNUSED(gws); - search_space_shape = TensorShape(max_lws_supported_x, - max_lws_supported_y, - max_lws_supported_z); -} - -cl::NDRange CLLWSListNormal::operator[](size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= size()); - auto coords = index2coords(search_space_shape, index); - return cl::NDRange{ _lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]] }; -} - -CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws) -{ - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); - - // Initialize the LWS values to test - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_z, gws[2], lws_z_max, false); - - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); -} - -void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) -{ - lws.push_back(1); - - for(unsigned int i = 2; i <= lws_max; ++i) - { - // Power of two condition - const bool is_power_of_two = (i & (i - 1)) == 0; - - // Condition for the module accordingly with the mod_let_one flag - const bool mod_cond = mod_let_one ? 
(gws % i) <= 1 : (gws % i) == 0;
-
-        if(mod_cond || is_power_of_two)
-        {
-            lws.push_back(i);
-        }
-    }
-}
-
-CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
-{
-    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
-    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
-    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
-
-    // Initialize the LWS values to test
-    initialize_lws_values(_lws_x, lws_x_max);
-    initialize_lws_values(_lws_y, lws_y_max);
-    initialize_lws_values(_lws_z, lws_z_max);
-
-    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
-}
-
-void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
-{
-    lws.push_back(1);
-
-    for(unsigned int i = 2; i <= lws_max; i *= 4)
-    {
-        lws.push_back(i);
-    }
-}
-} // namespace cl_tuner
-} // namespace arm_compute
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
new file mode 100644
index 0000000000..5e3907f1ea
--- /dev/null
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
+
+/** Non-instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
+class CLTuningParametersList : public ICLTuningParametersList
+{
+protected:
+    /* Shape of 4-D search space */
+    TensorShape               search_space_shape{0, 0, 0, 0};
+    std::vector<unsigned int> _lws_x{0};
+    std::vector<unsigned int> _lws_y{0};
+    std::vector<unsigned int> _lws_z{0};
+    std::vector<int>          _wbsm{0}; /* Modifies the batch size of workgroups distributed to compute units.
+                                           The value is in the range [-31,+31].
+                                           When 0, the runtime-selected wbs is used unmodified.
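+                                           For illustration (derived from the constructors below): when
+                                           CLTuningInfo::tune_wbsm is enabled, the candidate modifiers are
+                                           {-3,...,+3} in exhaustive mode, {-2,...,+2} in normal mode and
+                                           {-1, 0, 1} in rapid mode.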
 */
+
+    /** Constructor */
+    CLTuningParametersList() = default;
+    /** Copy Constructor */
+    CLTuningParametersList(const CLTuningParametersList &) = default;
+    /** Move Constructor */
+    CLTuningParametersList(CLTuningParametersList &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersList &operator=(const CLTuningParametersList &) = default;
+    /** Move Assignment */
+    CLTuningParametersList &operator=(CLTuningParametersList &&) noexcept(true) = default;
+    /** Destructor */
+    virtual ~CLTuningParametersList() = default;
+
+    // Inherited methods overridden:
+    virtual size_t size() override;
+};
+
+/** Exhaustive list of all possible Tuning parameters (lws) values */
+class CLTuningParametersListExhaustive : public CLTuningParametersList
+{
+public:
+    /** Prevent default constructor calls */
+    CLTuningParametersListExhaustive() = delete;
+    /** Constructor */
+    CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default;
+    /** Move Constructor */
+    CLTuningParametersListExhaustive(CLTuningParametersListExhaustive &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListExhaustive &operator=(const CLTuningParametersListExhaustive &) = default;
+    /** Move Assignment */
+    CLTuningParametersListExhaustive &operator=(CLTuningParametersListExhaustive &&) noexcept(true) = default;
+    /** Destructor */
+    ~CLTuningParametersListExhaustive() = default;
+
+    // Inherited methods overridden:
+    CLTuningParams operator[](size_t) override;
+};
+
+/** A subset of LWS values that are either factors of gws when gws[2] < 16 or powers of 2 */
+class CLTuningParametersListNormal : public CLTuningParametersList
+{
+public:
+    /** Constructor */
+    CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default;
+    /** Move Constructor */
+    CLTuningParametersListNormal(CLTuningParametersListNormal &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListNormal &operator=(const CLTuningParametersListNormal &) = default;
+    /** Move Assignment */
+    CLTuningParametersListNormal &operator=(CLTuningParametersListNormal &&) noexcept(true) = default;
+    /** Destructor */
+    ~CLTuningParametersListNormal() = default;
+
+    // Inherited methods overridden:
+    CLTuningParams operator[](size_t) override;
+
+    /** Default constructor (needed by the derived rapid list) */
+    CLTuningParametersListNormal() = default;
+
+private:
+    /** Utility function used to initialize the LWS values to test.
+     * Only the LWS values which are powers of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
+     *
+     * @param[in, out] lws         Vector of LWS to test
+     * @param[in]      gws         Size of the specific GWS
+     * @param[in]      lws_max     Max LWS value allowed to be tested
+     * @param[in]      mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one.
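+     *
+     * Illustrative example (not part of the original documentation; values derived
+     * from the implementation below):
+     * @code
+     * std::vector<unsigned int> lws;
+     * initialize_lws_values(lws, 12, 8, false); // gws = 12, lws_max = 8, mod_let_one = false
+     * // lws now holds {1, 2, 3, 4, 6, 8}: each i either divides 12 exactly or is a power of two
+     * @endcode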
+     */
+    void
+    initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+};
+
+/** A minimal subset of LWS values containing only 1, 2 and 4/8 per dimension */
+class CLTuningParametersListRapid : public CLTuningParametersListNormal
+{
+public:
+    /** Prevent default constructor calls */
+    CLTuningParametersListRapid() = delete;
+    /** Constructor */
+    CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default;
+    /** Move Constructor */
+    CLTuningParametersListRapid(CLTuningParametersListRapid &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListRapid &operator=(const CLTuningParametersListRapid &) = default;
+    /** Move Assignment */
+    CLTuningParametersListRapid &operator=(CLTuningParametersListRapid &&) noexcept(true) = default;
+    /** Destructor */
+    virtual ~CLTuningParametersListRapid() = default;
+
+private:
+    /** Utility function used to initialize the LWS values to test.
+     * Only LWS values of 1, 2 and 4/8 in each dimension are taken into account by the CLTuner
+     *
+     * @param[in, out] lws     Vector of LWS to test
+     * @param[in]      lws_max Max LWS value allowed to be tested
+     */
+    void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max);
+};
+
+size_t CLTuningParametersList::size()
+{
+    return search_space_shape.total_size();
+}
+
+CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3]));
+}
+
+CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    search_space_shape[0] = lws_x_max;
+    search_space_shape[1] = lws_y_max;
+    search_space_shape[2] = lws_z_max;
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-3, -2, -1, 0, 1, 2, 3};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]);
+}
+
+CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    // Initialize the tuning parameters values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
+    initialize_lws_values(_lws_x, gws[0], lws_x_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_y, gws[1], lws_y_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+    search_space_shape[0] = _lws_x.size();
    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-2, -1, 0, 1, 2};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+                                                         unsigned int               gws,
+                                                         unsigned int               lws_max,
+                                                         bool                       mod_let_one)
+{
+    lws.push_back(1);
+
+    for (unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Modulo condition according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if (mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+
+CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+    // Initialize the LWS values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
+    initialize_lws_values(_lws_x, lws_x_max);
+    initialize_lws_values(_lws_y, lws_y_max);
+    initialize_lws_values(_lws_z, lws_z_max);
+
+    search_space_shape[0] = _lws_x.size();
+    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-1, 0, 1};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+    lws.push_back(1);
+
+    for (unsigned int i = 2; i <= lws_max; i *= 4)
+    {
+        lws.push_back(i);
+    }
+}
+
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
+{
+    switch (tuning_info.tuner_mode)
+    {
+        case CLTunerMode::EXHAUSTIVE:
+            return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
+        case CLTunerMode::NORMAL:
+            return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info);
+        case CLTunerMode::RAPID:
+            return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info);
+        default:
+            return nullptr;
+    }
+}
+} // namespace cl_tuner
+} // namespace arm_compute
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 0a03497cb9..9fbdc3a4dd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -26,17 +26,21 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include <atomic> #include <condition_variable> #include <iostream> #include <list> +#include <memory> #include <mutex> #include <system_error> #include <thread> +#include <vector> namespace arm_compute { @@ -50,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. @@ -71,61 +74,6 @@ private: const unsigned int _end; }; -/** Given two dimensions and a maxium number of threads to utilise, calcualte the best - * combination of threads that fit in (mutliplied together) max_threads. - * - * This algorithm assumes that work in either of the dimensions is equally difficult - * to compute - * - * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension - */ -std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) -{ - /* - * We want the same ratio of threads in M & N to the ratio of m and n problem size - * - * Therefore: mt/nt == m/n where mt*nt == max_threads - * - * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt - * nt^2 = max_threads * (m/n) - * nt = sqrt( max_threads * (m/n) ) - */ - //ratio of m to n in problem dimensions - double ratio = m / static_cast<double>(n); - - // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); - - //find the nearest factor of max_threads - for(unsigned i = 0; i!= adjusted; ++i) - { - //try down - const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) - { - return { adj_down, max_threads / adj_down }; - } - - //try up - const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) - { - return { adj_up, max_threads / adj_up }; - } - } - - //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) - { - return{ std::min<unsigned>(m, max_threads), 1 }; - } - else - { - return{ 1, std::min<unsigned>(n, max_threads) }; - } -} - /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range. @@ -141,51 +89,77 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } -} //namespace - -struct CPPScheduler::Impl final +/** Set thread affinity. 
Pin current thread to a particular core
+ *
+ * @param[in] core_id ID of the core to which the current thread is pinned
+ */
+void set_thread_affinity(int core_id)
 {
-    explicit Impl(unsigned int thread_hint)
-        : _num_threads(thread_hint), _threads(_num_threads - 1)
-    {
-    }
-    void set_num_threads(unsigned int num_threads, unsigned int thead_hint)
+    if (core_id < 0)
     {
-        _num_threads = num_threads == 0 ? thead_hint : num_threads;
-        _threads.resize(_num_threads - 1);
-    }
-    unsigned int num_threads() const
-    {
-        return _num_threads;
+        return;
     }
-    void run_workloads(std::vector<IScheduler::Workload> &workloads);
-
-    class Thread;
+#if !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__)
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(core_id, &set);
+    ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set), "Error setting thread affinity");
+#endif /* !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__) */
+}

-    unsigned int       _num_threads;
-    std::list<Thread>  _threads;
-    arm_compute::Mutex _run_workloads_mutex{};
-};
+/** There are currently 2 scheduling modes supported by CPPScheduler
+ *
+ * Linear:
+ *  The default mode where all the scheduling is carried out by the main thread linearly (in a loop).
+ *  E.g. if there are 8 threads in total, there will be 1 main thread + 7 threads in the thread pool, and it is the
+ *  main thread's responsibility to start all the other threads in the thread pool.
+ *
+ * Fanout:
+ *  In fanout mode, the scheduling (starting other threads) task is distributed across many threads instead of just
+ *  the main thread.
+ *
+ *  The scheduler has a fixed parameter: wake_fanout, and the scheduling sequence goes like this:
+ *  1. Main thread wakes the first wake_fanout - 1 FanoutThreads from the thread pool
+ *      From thread: 0
+ *      To thread (non-inclusive): wake_fanout - 1
+ *  2. Each FanoutThread then wakes up to wake_fanout FanoutThreads from the thread pool:
+ *      From thread: (i + 1) * wake_fanout - 1
+ *      To thread (non-inclusive): (i + 2) * wake_fanout - 1
+ *      where i is the current thread's thread id
+ *      The end is clamped at the size of the thread pool and at the number of threads in use - 1
+ *
+ *  E.g. for a total number of 8 threads (1 main thread, 7 FanoutThreads in the thread pool) with a fanout of 3:
+ *  1. Main thread wakes FanoutThreads 0, 1
+ *  2. FanoutThread 0 wakes FanoutThreads 2, 3, 4
+ *  3. FanoutThread 1 wakes FanoutThreads 5, 6
+ */

-class CPPScheduler::Impl::Thread final
+class Thread final
 {
 public:
-    /** Start a new thread. */
-    Thread();
+    /** Start a new thread
+     *
+     * Thread will be pinned to a given core id if value is non-negative
+     *
+     * @param[in] core_pin Core id to pin the thread on. If negative, no thread pinning will take place
+     */
+    explicit Thread(int core_pin = -1);

-    Thread(const Thread &) = delete;
+    Thread(const Thread &)            = delete;
     Thread &operator=(const Thread &) = delete;
     Thread(Thread &&)                 = delete;
-    Thread &operator=(Thread &&) = delete;
+    Thread &operator=(Thread &&)      = delete;

     /** Destructor. Make the thread join. */
     ~Thread();

+    /** Set workloads */
+    void set_workload(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info);
+
     /** Request the worker thread to start executing workloads.
      *
      * The thread will start by executing workloads[info.thread_id] and will then call the feeder to
@@ -194,47 +168,72 @@ public:
      * @note This function will return as soon as the workloads have been sent to the worker thread.
      * wait() needs to be called to ensure the execution is complete.
*/ - void start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info); + void start(); /** Wait for the current kernel execution to complete. */ - void wait(); + std::exception_ptr wait(); /** Function ran by the worker thread. */ void worker_thread(); + /** Set the scheduling strategy to be linear */ + void set_linear_mode() + { + _thread_pool = nullptr; + _wake_beg = 0; + _wake_end = 0; + } + + /** Set the scheduling strategy to be fanout */ + void set_fanout_mode(std::list<Thread> *thread_pool, unsigned int wake_beg, unsigned int wake_end) + { + _thread_pool = thread_pool; + _wake_beg = wake_beg; + _wake_end = wake_end; + } + private: std::thread _thread{}; ThreadInfo _info{}; - std::vector<IScheduler::Workload> *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; + std::vector<IScheduler::Workload> *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list<Thread> *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -CPPScheduler::Impl::Thread::Thread() +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } -CPPScheduler::Impl::Thread::~Thread() +Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; - start(nullptr, feeder, ThreadInfo()); + set_workload(nullptr, feeder, ThreadInfo()); + start(); _thread.join(); } } -void CPPScheduler::Impl::Thread::start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info) +void Thread::set_workload(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info) { _workloads = workloads; _feeder = &feeder; _info = info; +} + +void Thread::start() +{ { std::lock_guard<std::mutex> lock(_m); _wait_for_work = true; @@ -243,22 +242,20 @@ void CPPScheduler::Impl::Thread::start(std::vector<IScheduler::Workload> *worklo _cv.notify_one(); } -void CPPScheduler::Impl::Thread::wait() +std::exception_ptr Thread::wait() { { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _job_complete; }); } - - if(_current_exception) - { - std::rethrow_exception(_current_exception); - } + return _current_exception; } -void CPPScheduler::Impl::Thread::worker_thread() +void Thread::worker_thread() { - while(true) + set_thread_affinity(_core_pin); + + while (true) { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -266,12 +263,24 @@ void CPPScheduler::Impl::Thread::worker_thread() _current_exception = nullptr; - // Time to exit - if(_workloads == nullptr) + // Exit if the worker thread has not been fed with workloads + if (_workloads == nullptr || _feeder == nullptr) { return; } + // Wake up more peer threads from thread pool if this job has been delegated to the current thread + if (_thread_pool != nullptr) + { + auto thread_it = _thread_pool->begin(); + std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg)); + auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1)); + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) + { + thread_it->start(); + } + } + #ifndef 
ARM_COMPUTE_EXCEPTIONS_DISABLED
        try
        {
@@ -280,19 +289,142 @@
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         }
-        catch(...)
+        catch (...)
         {
             _current_exception = std::current_exception();
         }
 #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+        _workloads    = nullptr;
         _job_complete = true;
         lock.unlock();
         _cv.notify_one();
     }
 }
+} //namespace
+
+struct CPPScheduler::Impl final
+{
+    constexpr static unsigned int m_default_wake_fanout = 4;
+    enum class Mode
+    {
+        Linear,
+        Fanout
+    };
+    enum class ModeToggle
+    {
+        None,
+        Linear,
+        Fanout
+    };
+    explicit Impl(unsigned int thread_hint)
+        : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U)
+    {
+        const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE"));
+        if (mode_env_v == "linear")
+        {
+            _forced_mode = ModeToggle::Linear;
+        }
+        else if (mode_env_v == "fanout")
+        {
+            _forced_mode = ModeToggle::Fanout;
+        }
+        else
+        {
+            _forced_mode = ModeToggle::None;
+        }
+    }
+    void set_num_threads(unsigned int num_threads, unsigned int thread_hint)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+        _threads.resize(_num_threads - 1);
+        auto_switch_mode(_num_threads);
+    }
+    void set_num_threads_with_affinity(unsigned int num_threads, unsigned int thread_hint, BindFunc func)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+
+        // Set affinity on main thread
+        set_thread_affinity(func(0, thread_hint));
+
+        // Set affinity on worker threads
+        _threads.clear();
+        for (auto i = 1U; i < _num_threads; ++i)
+        {
+            _threads.emplace_back(func(i, thread_hint));
+        }
+        auto_switch_mode(_num_threads);
+    }
+    void auto_switch_mode(unsigned int num_threads_to_use)
+    {
+        // If the environment variable is set to either mode, it overrides the mode that would be selected based on num_threads_to_use
+        if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
+        {
+            set_fanout_mode(m_default_wake_fanout, num_threads_to_use);
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n",
+                this->wake_fanout(), num_threads_to_use);
+        }
+        else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8))
+        {
+            set_linear_mode();
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n",
+                                                      num_threads_to_use);
+        }
+    }
+    void set_linear_mode()
+    {
+        for (auto &thread : _threads)
+        {
+            thread.set_linear_mode();
+        }
+        _mode        = Mode::Linear;
+        _wake_fanout = 0U;
+    }
+    void set_fanout_mode(unsigned int wake_fanout, unsigned int num_threads_to_use)
+    {
+        ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1);
+        const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));
+        auto       thread_it          = _threads.begin();
+        for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
+        {
+            const auto wake_begin = i * actual_wake_fanout - 1;
+            const auto wake_end   = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
+            thread_it->set_fanout_mode(&_threads, wake_begin, wake_end);
+        }
+        // Reset the remaining threads' wake-up schedule
+        while (thread_it != _threads.end())
+        {
+            thread_it->set_fanout_mode(&_threads, 0U, 0U);
+            ++thread_it;
+        }
+        _mode        = Mode::Fanout;
+        _wake_fanout = actual_wake_fanout;
+    }
+    unsigned int num_threads() const
+    {
+        return _num_threads;
+    }
+    unsigned int wake_fanout() const
+    {
+        return
_wake_fanout; + } + Mode mode() const + { + return _mode; + } + + void run_workloads(std::vector<IScheduler::Workload> &workloads); + + unsigned int _num_threads; + std::list<Thread> _threads; + arm_compute::Mutex _run_workloads_mutex{}; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; +}; /* - * This singleton has been deprecated and will be removed in the next release + * This singleton has been deprecated and will be removed in future releases */ CPPScheduler &CPPScheduler::get() { @@ -300,8 +432,7 @@ CPPScheduler &CPPScheduler::get() return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(support::cpp14::make_unique<Impl>(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint())) { } @@ -314,6 +445,13 @@ void CPPScheduler::set_num_threads(unsigned int num_threads) _impl->set_num_threads(num_threads, num_threads_hint()); } +void CPPScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) +{ + // No changes in the number of threads while current workloads are running + arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex); + _impl->set_num_threads_with_affinity(num_threads, num_threads_hint(), func); +} + unsigned int CPPScheduler::num_threads() const { return _impl->num_threads(); @@ -327,137 +465,93 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads) // This is not great because different threads workloads won't run in parallel but at least they // won't interfere each other and deadlock. arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex); - const unsigned int num_threads = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); - if(num_threads < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); + if (num_threads_to_use < 1) { return; } - ThreadFeeder feeder(num_threads, workloads.size()); + // Re-adjust the mode if the actual number of threads to use is different from the number of threads created + _impl->auto_switch_mode(num_threads_to_use); + int num_threads_to_start = 0; + switch (_impl->mode()) + { + case CPPScheduler::Impl::Mode::Fanout: + { + num_threads_to_start = static_cast<int>(_impl->wake_fanout()) - 1; + break; + } + case CPPScheduler::Impl::Mode::Linear: + default: + { + num_threads_to_start = static_cast<int>(num_threads_to_use) - 1; + break; + } + } + ThreadFeeder feeder(num_threads_to_use, workloads.size()); ThreadInfo info; - info.cpu_info = &_cpu_info; - info.num_threads = num_threads; + info.cpu_info = &cpu_info(); + info.num_threads = num_threads_to_use; unsigned int t = 0; auto thread_it = _impl->_threads.begin(); - for(; t < num_threads - 1; ++t, ++thread_it) + // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; - thread_it->start(&workloads, feeder, info); + thread_it->set_workload(&workloads, feeder, info); } - - info.thread_id = t; - process_workloads(workloads, feeder, info); + thread_it = _impl->_threads.begin(); + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) + { + thread_it->start(); + } + info.thread_id = t; // Set main thread's thread_id + std::exception_ptr last_exception = nullptr; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED try { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ - for(auto &thread : _impl->_threads) - { - thread.wait(); - 
} +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (...) { - std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; + last_exception = std::current_exception(); } -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ -} -#endif /* DOXYGEN_SKIP_THIS */ -void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) -{ - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - - const Window &max_window = kernel->window(); - - if(hints.split_dimension() == IScheduler::split_dimensions_all) + try { - /* - * if the split dim is size_t max then this signals we should parallelise over - * all dimensions - */ - const std::size_t m = max_window.num_iterations(Window::DimX); - const std::size_t n = max_window.num_iterations(Window::DimY); - - //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... - unsigned m_threads, n_threads; - std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); - - std::vector<IScheduler::Workload> workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + thread_it = _impl->_threads.begin(); + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) { - for(unsigned int mi = 0; mi != m_threads; ++mi) + std::exception_ptr current_exception = thread_it->wait(); + if (current_exception) { - workloads.push_back( - [ ni, mi, m_threads, n_threads, &max_window, &kernel ] - (const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); - - win.validate(); - - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - - thread_locator.validate(); - - kernel->run_nd(win, info, thread_locator); - } - ); + last_exception = current_exception; } } - run_workloads(workloads); + if (last_exception) + { + std::rethrow_exception(last_exception); + } +#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - else + catch (const std::system_error &e) { - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); + std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; + } +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +} +#endif /* DOXYGEN_SKIP_THIS */ - if(num_iterations == 0) - { - return; - } +void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ + schedule_common(kernel, hints, window, tensors); +} - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); - } - else - { - unsigned int num_windows = 0; - switch(hints.strategy()) - { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); - } - std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; - } - run_workloads(workloads); - } - } +void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) +{ + ITensorPack tensors; + schedule_common(kernel, hints, kernel->window(), tensors); } } // namespace arm_compute diff --git a/src/runtime/CPP/ICPPSimpleFunction.cpp b/src/runtime/CPP/ICPPSimpleFunction.cpp index 42a2d2228c..f4fef11acc 100644 --- a/src/runtime/CPP/ICPPSimpleFunction.cpp +++ b/src/runtime/CPP/ICPPSimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 660a79652c..c46a2731d8 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,23 +37,38 @@ void SingleThreadScheduler::set_num_threads(unsigned int num_threads) void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { - const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + const Window &max_window = kernel->window(); + + if (hints.split_dimension() != IScheduler::split_dimensions_all) { - return; + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + if (num_iterations < 1) + { + return; + } } ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); kernel->run(kernel->window(), info); } +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack &tensors) +{ + ARM_COMPUTE_UNUSED(hints); + ThreadInfo info; + info.cpu_info = &cpu_info(); + kernel->run_op(tensors, window, info); +} + void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads) { ThreadInfo info; - info.cpu_info = &_cpu_info; - for(auto &wl : workloads) + info.cpu_info = &cpu_info(); + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index 232f71dbea..94a1673d59 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -40,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -78,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint8_t *>(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<int8_t *>(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint16_t *>(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + 
input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -130,12 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -146,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -156,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -164,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? 
&_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -233,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -245,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp 
b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 4ec0ab6c1a..e6291f973e 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,31 +27,44 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"

+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include <list>

 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc,
+                          const ITensorInfo *input_conf,
+                          const ITensorInfo *input_priorbox,
+                          const ITensorInfo *output,
+                          DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+                                    "The priorbox input tensor should be [C3, 2, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1");
     const int num_priors = input_priorbox->tensor_shape()[0] / 4;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+                                        input_loc->tensor_shape()[0],
+                                    "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+                                        input_conf->tensor_shape()[0],
+                                    "Number of priors must match number of confidence predictions.");
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        const unsigned int max_size =
+            info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
     }
@@ -62,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input
 /** Function used to sort pair<float, T> in descending order based on the score (first) value.
*/ template <typename T> -bool SortScorePairDescend(const std::pair<float, T> &pair1, - const std::pair<float, T> &pair2) +bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2) { return pair1.first > pair2.first; } @@ -79,16 +91,19 @@ bool SortScorePairDescend(const std::pair<float, T> &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector<LabelBBox> &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector<LabelBBox> &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -99,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -127,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector<std::map<int, std::vector<float>>> &all_confidence_scores) { std::vector<float> tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -165,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector<BBox> &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector<BBox> &all_prior_bboxes, std::vector<std::array<float, 4>> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array<float, 4> var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array<float, 4> var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -205,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. 
* */ -void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array<float, 4> &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. - switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -234,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -255,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? 
bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -266,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -286,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian * @param[out] indices The kept indices of bboxes after nms. * */ -void ApplyNMSFast(const std::vector<BBox> &bboxes, - const std::vector<float> &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector<int> &indices) +void ApplyNMSFast(const std::vector<BBox> &bboxes, + const std::vector<float> &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector<int> &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -297,9 +328,9 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, std::list<std::pair<float, int>> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -310,7 +341,7 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -319,46 +350,45 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. 
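The hunk above only reflows ApplyNMSFast's jaccard (IoU) computation; the behaviour is unchanged, and the intersection arithmetic it describes continues just below. As a compact reference for what that code computes, here is a standalone sketch under the same conventions, where BBox is std::array<float, 4> in [xmin, ymin, xmax, ymax] order as elsewhere in this diff; the helper name jaccard_overlap is illustrative and not part of the library:

#include <algorithm>
#include <array>

using BBox = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

// Intersection-over-union of two corner-encoded boxes, mirroring the
// intersect/size arithmetic in the hunk that continues below.
float jaccard_overlap(const BBox &a, const BBox &b)
{
    const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]); // intersection width
    const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]); // intersection height
    if (iw <= 0.f || ih <= 0.f)
    {
        return 0.f; // disjoint boxes do not overlap
    }
    const float inter  = iw * ih;
    const float size_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float size_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (size_a + size_b - inter);
}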
- BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array<float, 4>({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array<float, 4>( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 
0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -368,12 +398,12 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -382,23 +412,42 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); + ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); + // Output auto initialization if not yet initialized // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -414,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. 
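With the overlap test in hand, the control flow of ApplyNMSFast, whose reformatted body ends above, is a greedy keep-loop over score-sorted candidates in which the suppression threshold is decayed by eta once boxes start being kept. A minimal sketch of that loop, reusing the BBox alias and jaccard_overlap helper sketched earlier; nms_keep and its parameters are illustrative:

#include <list>
#include <utility>
#include <vector>

// candidates holds (score, box index) pairs already sorted by descending
// score, as SortScorePairDescend produces in the code above.
std::vector<int> nms_keep(const std::vector<BBox> &boxes,
                          std::list<std::pair<float, int>> candidates,
                          float nms_threshold,
                          float eta)
{
    std::vector<int> kept;
    float adaptive_threshold = nms_threshold;
    while (!candidates.empty())
    {
        const int idx  = candidates.front().second;
        bool      keep = true;
        for (int kept_idx : kept)
        {
            if (jaccard_overlap(boxes[idx], boxes[kept_idx]) > adaptive_threshold)
            {
                keep = false; // suppressed by a higher-scoring box already kept
                break;
            }
        }
        if (keep)
        {
            kept.push_back(idx);
            if (eta < 1.f && adaptive_threshold > 0.5f)
            {
                adaptive_threshold *= eta; // suppression becomes stricter over time
            }
        }
        candidates.pop_front();
    }
    return kept;
}

The eta decay matches the `if (keep && eta < 1.f && adaptive_threshold > 0.5f)` branch in the diff; with eta == 1 the threshold stays fixed and this reduces to plain greedy NMS.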
continue; @@ -434,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -443,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -453,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; std::map<int, std::vector<int>> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(c)->second; - const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; + const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector<int> &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -535,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map<int, std::vector<int>> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -556,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector<float> &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
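The writes rewrapped below serialize each kept detection as one seven-float row, [image_id, label, confidence, xmin, ymin, xmax, ymax], matching the layout comment in configure(). A hedged sketch of that row layout; write_detection and its signature are illustrative, with row assumed to point at one 7-element row of the output tensor:

#include <array>

using BBox = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

void write_detection(float *row, int image_id, int label, float score, const BBox &box)
{
    row[0] = static_cast<float>(image_id); // index of the image in the batch
    row[1] = static_cast<float>(label);    // detected class
    row[2] = score;                        // post-NMS confidence
    row[3] = box[0];                       // xmin
    row[4] = box[1];                       // ymin
    row[5] = box[2];                       // xmax
    row[6] = box[3];                       // ymax
}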
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector<BBox> &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector<int> &indices = it.second; + const std::vector<int> &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index b3fc9c776d..2861d6cacb 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,6 +27,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <cstddef> #include <ios> #include <list> @@ -35,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize]."); - if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); 
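For orientation, the shape checks being rewrapped in this validate_arguments all derive from one quantity, max_detections * max_classes_per_detection. A small sketch of the output geometry they enforce, assuming kBatchSize == 1 and kNumCoordBox == 4 as declared in this file; the struct and function names are illustrative:

#include <array>

struct OutputShapes
{
    std::array<unsigned int, 3> boxes;         // [4, M, 1], corner coordinates per detection
    std::array<unsigned int, 2> classes;       // [M, 1]
    std::array<unsigned int, 2> scores;        // [M, 1]
    unsigned int                num_detection; // [1], count of valid detections
};

OutputShapes expected_shapes(unsigned int max_detections, unsigned int max_classes_per_detection)
{
    const unsigned int m = max_detections * max_classes_per_detection;
    return OutputShapes{{4U, m, 1U}, {m, 1U}, {m, 1U}, 1U};
}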
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+                                                           TensorShape(num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
     }
-    if(num_detection->total_size() != 0)
+    if (num_detection->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -90,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn
     return Status{};
 }

-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
 {
     const float half_factor = 0.5f;

     // BBox is equivalent to CenterSizeEncoding [y,x,h,w]
     const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
     const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
-    const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
-    const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+    const float half_h =
+        half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+    const float half_w =
+        half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];

     // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
     auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
@@ -115,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode
 * @param[in]  info               The detection information
 * @param[out] decoded_boxes      The decoded bboxes.
*/ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -130,103 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_signed_t 
*>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms, - std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector<int> &result_idx_boxes_after_nms, + const std::vector<float> &result_scores_after_nms, + const std::vector<int> &result_classes_after_nms, + std::vector<unsigned int> &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]); - 
*(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast<float>(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + 
DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); + _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -238,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? 
info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores; @@ -253,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -262,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, 
info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -286,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector<int> result_idx_boxes_after_nms; std::vector<int> result_classes_after_nms; std::vector<float> result_scores_after_nms; std::vector<unsigned int> sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == 
-1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -353,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); std::vector<float> max_scores; std::vector<int> box_indices; std::vector<int> max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector<float> box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector<unsigned int> max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -405,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima 
Suppression _nms.run(); std::vector<unsigned int> selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -418,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index 8856191996..3217742c6b 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,33 @@ #include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h" #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" namespace arm_compute { -void CPPNonMaximumSuppression::configure( - const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +void CPPNonMaximumSuppression::configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - auto k = arm_compute::support::cpp14::make_unique<CPPNonMaximumSuppressionKernel>(); + ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + + auto k = std::make_unique<CPPNonMaximumSuppressionKernel>(); k->configure(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); _kernel = std::move(k); } -Status CPPNonMaximumSuppression::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, + nms_threshold); } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp index 1cdfe92db2..83941f1dc1 100644 --- a/src/runtime/CPP/functions/CPPPermute.cpp +++ b/src/runtime/CPP/functions/CPPPermute.cpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,13 +24,16 @@ #include "arm_compute/runtime/CPP/functions/CPPPermute.h" #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" using namespace arm_compute; void CPPPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<CPPPermuteKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, perm); + + auto k = std::make_unique<CPPPermuteKernel>(); k->configure(input, output, perm); _kernel = std::move(k); } diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp index eb0d560bdf..3d64def804 100644 --- a/src/runtime/CPP/functions/CPPTopKV.cpp +++ b/src/runtime/CPP/functions/CPPTopKV.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,18 +24,24 @@ #include "arm_compute/runtime/CPP/functions/CPPTopKV.h" #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" namespace arm_compute { void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) { - auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>(); + ARM_COMPUTE_LOG_PARAMS(predictions, targets, output, k); + + auto kernel = std::make_unique<CPPTopKVKernel>(); kernel->configure(predictions, targets, output, k); _kernel = std::move(kernel); } -Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { return CPPTopKVKernel::validate(predictions, targets, output, k); } diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp index a154b5ee66..8f72473aeb 100644 --- a/src/runtime/CPP/functions/CPPUpsample.cpp +++ b/src/runtime/CPP/functions/CPPUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,13 +24,16 @@ #include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" using namespace arm_compute; void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, info); + + auto k = std::make_unique<CPPUpsampleKernel>(); k->configure(input, output, info); _kernel = std::move(k); } diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp deleted file mode 100644 index d8f01a9066..0000000000 --- a/src/runtime/CPUUtils.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CPUUtils.h" - -#include "arm_compute/core/CPP/CPPTypes.h" -#include "arm_compute/core/Error.h" -#include "support/StringSupport.h" - -#include <algorithm> -#include <array> -#include <cstdlib> -#include <cstring> -#include <fstream> -#include <map> - -#ifndef BARE_METAL -/* C++ std::regex takes up a lot of space in the standalone builds */ -#include <regex.h> -#include <thread> -#endif /* BARE_METAL */ - -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) -#include <sys/auxv.h> - -/* Get HWCAP bits from asm/hwcap.h */ -#include <asm/hwcap.h> -#endif /* !BARE_METAL */ - -/* Make sure the bits we care about are defined, just in case asm/hwcap.h is - * out of date (or for bare metal mode) */ -#ifndef HWCAP_ASIMDHP -#define HWCAP_ASIMDHP (1 << 10) // NOLINT -#endif /* HWCAP_ASIMDHP */ - -#ifndef HWCAP_CPUID -#define HWCAP_CPUID (1 << 11) // NOLINT -#endif /* HWCAP_CPUID */ - -#ifndef HWCAP_ASIMDDP -#define HWCAP_ASIMDDP (1 << 20) // NOLINT -#endif /* HWCAP_ASIMDDP */ - -namespace -{ -using namespace arm_compute; - -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) - -bool model_supports_dot(CPUModel model) -{ - switch(model) - { - case CPUModel::GENERIC_FP16_DOT: - case CPUModel::A55r1: - return true; - default: - return false; - } -} - -bool model_supports_fp16(CPUModel model) -{ - switch(model) - { - case CPUModel::GENERIC_FP16: - case CPUModel::GENERIC_FP16_DOT: - case CPUModel::A55r1: - return true; - default: - return false; - } -} - -/* Convert an MIDR register value to a CPUModel enum value. */ -CPUModel midr_to_model(const unsigned int midr) -{ - CPUModel model = CPUModel::GENERIC; - - // Unpack variant and CPU ID - const int implementer = (midr >> 24) & 0xFF; - const int variant = (midr >> 20) & 0xF; - const int cpunum = (midr >> 4) & 0xFFF; - - if(implementer == 0x41) // Arm CPUs - { - // Only CPUs we have code paths for are detected. 
All other CPUs can be safely classed as "GENERIC" - switch(cpunum) - { - case 0xd03: // A53 - case 0xd04: // A35 - model = CPUModel::A53; - break; - case 0xd05: // A55 - if(variant != 0) - { - model = CPUModel::A55r1; - } - else - { - model = CPUModel::A55r0; - } - break; - case 0xd0a: // A75 - if(variant != 0) - { - model = CPUModel::GENERIC_FP16_DOT; - } - else - { - model = CPUModel::GENERIC_FP16; - } - break; - case 0xd0b: // A76 - case 0xd06: - case 0xd0c: - case 0xd0d: - model = CPUModel::GENERIC_FP16_DOT; - break; - default: - model = CPUModel::GENERIC; - break; - } - } - else if(implementer == 0x48) - { - // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC" - switch(cpunum) - { - case 0xd40: // A76 - model = CPUModel::GENERIC_FP16_DOT; - break; - default: - model = CPUModel::GENERIC; - break; - } - } - - return model; -} - -void populate_models_cpuid(std::vector<CPUModel> &cpusv) -{ - // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table. - uint32_t i = 0; - for(auto &c : cpusv) - { - std::stringstream str; - str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1"; - std::ifstream file; - file.open(str.str(), std::ios::in); - if(file.is_open()) - { - std::string line; - if(bool(getline(file, line))) - { - const uint32_t midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16); - c = midr_to_model(midr & 0xffffffff); - } - } - } -} - -void populate_models_cpuinfo(std::vector<CPUModel> &cpusv) -{ - regex_t proc_regex; - regex_t imp_regex; - regex_t var_regex; - regex_t part_regex; - regex_t rev_regex; - - memset(&proc_regex, 0, sizeof(regex_t)); - memset(&imp_regex, 0, sizeof(regex_t)); - memset(&var_regex, 0, sizeof(regex_t)); - memset(&part_regex, 0, sizeof(regex_t)); - memset(&rev_regex, 0, sizeof(regex_t)); - - int ret_status = 0; - // If "long-form" cpuinfo is present, parse that to populate models. - ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED); - ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED); - ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED); - ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED); - ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED); - ARM_COMPUTE_UNUSED(ret_status); - ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); - - std::ifstream file; - file.open("/proc/cpuinfo", std::ios::in); - - if(file.is_open()) - { - std::string line; - int midr = 0; - int curcpu = -1; - - while(bool(getline(file, line))) - { - std::array<regmatch_t, 2> match; - ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int newcpu = support::cpp11::stoi(id, nullptr); - - if(curcpu >= 0 && midr == 0) - { - // Matched a new CPU ID without any description of the previous one - looks like old format. 
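// Aside: a minimal, self-contained sketch of the MIDR_EL1 bit-unpacking that
// midr_to_model() above relies on (field layout per the Arm Architecture
// Reference Manual; 0x410FD050 is an illustrative value for a Cortex-A55 r0p0):
constexpr unsigned int midr_implementer(unsigned int midr) { return (midr >> 24) & 0xFF; } // 0x41 == Arm
constexpr unsigned int midr_variant(unsigned int midr)     { return (midr >> 20) & 0xF;  } // major revision (rN)
constexpr unsigned int midr_part(unsigned int midr)        { return (midr >> 4) & 0xFFF; } // CPU part number
constexpr unsigned int midr_revision(unsigned int midr)    { return midr & 0xF;          } // minor revision (pN)
static_assert(midr_implementer(0x410FD050) == 0x41, "implementer field: Arm");
static_assert(midr_part(0x410FD050) == 0xD05, "part field: Cortex-A55");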
- return; - } - - if(curcpu >= 0) - { - cpusv[curcpu] = midr_to_model(midr); - } - - midr = 0; - curcpu = newcpu; - - continue; - } - - ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (impv << 24); - - continue; - } - - ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (varv << 20); - - continue; - } - - ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (partv << 4); - - continue; - } - - ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int regv = support::cpp11::stoi(subexp, nullptr); - midr |= (regv); - midr |= (0xf << 16); - - continue; - } - } - - if(curcpu >= 0) - { - cpusv[curcpu] = midr_to_model(midr); - } - } - - // Free allocated memory - regfree(&proc_regex); - regfree(&imp_regex); - regfree(&var_regex); - regfree(&part_regex); - regfree(&rev_regex); -} - -int get_max_cpus() -{ - int max_cpus = 1; - std::ifstream CPUspresent; - CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); - bool success = false; - - if(CPUspresent.is_open()) - { - std::string line; - - if(bool(getline(CPUspresent, line))) - { - /* The content of this file is a list of ranges or single values, e.g. - * 0-5, or 1-3,5,7 or similar. As we are interested in the - * max valid ID, we just need to find the last valid - * delimiter ('-' or ',') and parse the integer immediately after that. - */ - auto startfrom = line.begin(); - - for(auto i = line.begin(); i < line.end(); ++i) - { - if(*i == '-' || *i == ',') - { - startfrom = i + 1; - } - } - - line.erase(line.begin(), startfrom); - - max_cpus = support::cpp11::stoi(line, nullptr) + 1; - success = true; - } - } - - // Return std::thread::hardware_concurrency() as a fallback. 
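// Aside: the /sys/devices/system/cpu/present parsing above in sketch form
// (parse_max_cpus_sketch is a hypothetical helper, assuming <string> is
// available; not the library's support wrappers): given a list such as
// "0-3,5,7", take the integer after the last '-' or ',' and add one.
inline int parse_max_cpus_sketch(const std::string &line)
{
    const std::size_t pos = line.find_last_of("-,");
    // "0-3,5,7" -> tail "7" -> 7 + 1 == 8 CPUs
    return std::stoi(pos == std::string::npos ? line : line.substr(pos + 1)) + 1;
}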
- if(!success) - { - max_cpus = std::thread::hardware_concurrency(); - } - return max_cpus; -} -#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ - -} // namespace - -namespace arm_compute -{ -void get_cpu_configuration(CPUInfo &cpuinfo) -{ -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) - bool cpuid = false; - bool hwcaps_fp16_support = false; - bool hwcaps_dot_support = false; - - const uint32_t hwcaps = getauxval(AT_HWCAP); - - if((hwcaps & HWCAP_CPUID) != 0) - { - cpuid = true; - } - - if((hwcaps & HWCAP_ASIMDHP) != 0) - { - hwcaps_fp16_support = true; - } - -#if defined(__aarch64__) - if((hwcaps & HWCAP_ASIMDDP) != 0) - { - hwcaps_dot_support = true; - } -#endif /* defined(__aarch64__) */ - - const unsigned int max_cpus = get_max_cpus(); - cpuinfo.set_cpu_num(max_cpus); - std::vector<CPUModel> percpu(max_cpus, CPUModel::GENERIC); - if(cpuid) - { - populate_models_cpuid(percpu); - } - else - { - populate_models_cpuinfo(percpu); - } - int j(0); - // Update dot product and FP16 support if one of the CPUs support these features - // We assume that the system does not have mixed architectures - bool one_supports_dot = false; - bool one_supports_fp16 = false; - for(const auto &v : percpu) - { - one_supports_dot = one_supports_dot || model_supports_dot(v); - one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v); - cpuinfo.set_cpu_model(j++, v); - } - cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support); - cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support); -#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ - ARM_COMPUTE_UNUSED(cpuinfo); -#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ -} - -unsigned int get_threads_hint() -{ - unsigned int num_threads_hint = 1; - -#if !defined(BARE_METAL) - std::map<std::string, unsigned int> cpu_part_occurrence_map; - - // CPU part regex - regex_t cpu_part_rgx; - memset(&cpu_part_rgx, 0, sizeof(regex_t)); - int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED); - ARM_COMPUTE_UNUSED(ret_status); - ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); - - // Read cpuinfo and get occurrence of each core - std::ifstream cpuinfo; - cpuinfo.open("/proc/cpuinfo", std::ios::in); - if(cpuinfo.is_open()) - { - std::string line; - while(bool(getline(cpuinfo, line))) - { - std::array<regmatch_t, 2> match; - ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end()) - { - cpu_part_occurrence_map[cpu_part]++; - } - else - { - cpu_part_occurrence_map[cpu_part] = 1; - } - } - } - } - regfree(&cpu_part_rgx); - - // Get min number of threads - auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(), - [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2) - { - return p1.second < p2.second; - }); - - // Set thread hint - num_threads_hint = cpu_part_occurrence_map.empty() ? 
std::thread::hardware_concurrency() : min_common_cores->second; -#endif /* !defined(BARE_METAL) */ - - return num_threads_hint; -} - -} // namespace arm_compute diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp deleted file mode 100644 index 7080220320..0000000000 --- a/src/runtime/Distribution1D.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/Distribution1D.h" - -#include "arm_compute/core/Error.h" - -#include <cstdint> - -using namespace arm_compute; - -Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range) - : IDistribution1D(num_bins, offset, range), _data(num_bins) -{ -} - -uint32_t *Distribution1D::buffer() const -{ - return _data.data(); -} diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp deleted file mode 100644 index 70a1f4f8ff..0000000000 --- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -#include <cstddef> - -namespace arm_compute -{ -void *GCBufferAllocator::allocate(size_t size, size_t alignment) -{ - ARM_COMPUTE_UNUSED(alignment); - - auto *gl_ssbo_name = new GLuint; - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, *gl_ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); - - return reinterpret_cast<void *>(gl_ssbo_name); -} - -void GCBufferAllocator::free(void *ptr) -{ - ARM_COMPUTE_ERROR_ON(ptr == nullptr); - auto *gl_ssbo_name = reinterpret_cast<GLuint *>(ptr); - ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, gl_ssbo_name)); - delete gl_ssbo_name; -} - -std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment) -{ - ARM_COMPUTE_UNUSED(alignment); - return arm_compute::support::cpp14::make_unique<GCBufferMemoryRegion>(size); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCHelpers.cpp b/src/runtime/GLES_COMPUTE/GCHelpers.cpp deleted file mode 100644 index df2f4f5e6e..0000000000 --- a/src/runtime/GLES_COMPUTE/GCHelpers.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h" - -#include "arm_compute/core/Error.h" - -namespace arm_compute -{ -std::tuple<EGLDisplay, EGLContext, EGLBoolean> create_opengl_display_and_context() -{ - EGLBoolean res; - EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError()); - - res = eglInitialize(display, nullptr, nullptr); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - const char *egl_extension_st = eglQueryString(display, EGL_EXTENSIONS); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context"); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context"); - ARM_COMPUTE_UNUSED(egl_extension_st); - - const std::array<EGLint, 3> config_attribs = - { - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, - EGL_NONE - }; - EGLConfig cfg; - EGLint count; - - res = eglChooseConfig(display, config_attribs.data(), &cfg, 1, &count); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglBindAPI(EGL_OPENGL_ES_API); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError()); - - const std::array<EGLint, 3> attribs = - { - EGL_CONTEXT_CLIENT_VERSION, 3, - EGL_NONE - }; - EGLContext context = eglCreateContext(display, - cfg, - EGL_NO_CONTEXT, - attribs.data()); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, context); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - return std::make_tuple(display, context, res); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp deleted file mode 100644 index f1457c4d6e..0000000000 --- a/src/runtime/GLES_COMPUTE/GCMemory.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h" - -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -namespace arm_compute -{ -GCMemory::GCMemory() - : _region(nullptr), _region_owned(nullptr) -{ -} - -GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) -{ - _region_owned = memory; - _region = _region_owned.get(); -} - -GCMemory::GCMemory(IGCMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) -{ - _region = memory; -} - -IGCMemoryRegion *GCMemory::gc_region() -{ - return _region; -} - -IGCMemoryRegion *GCMemory::gc_region() const -{ - return _region; -} - -IMemoryRegion *GCMemory::region() -{ - return _region; -} - -IMemoryRegion *GCMemory::region() const -{ - return _region; -} - -void GCMemory::set_region(IMemoryRegion *region) -{ - auto gc_region = utils::cast::polymorphic_downcast<IGCMemoryRegion *>(region); - _region_owned = nullptr; - _region = gc_region; -} - -void GCMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region) -{ - _region_owned = utils::cast::polymorphic_downcast_unique_ptr<IGCMemoryRegion>(std::move(region)); - _region = _region_owned.get(); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp deleted file mode 100644 index 2ffd9f2ffc..0000000000 --- a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -#include "arm_compute/core/Error.h" - -namespace arm_compute -{ -IGCMemoryRegion::IGCMemoryRegion(size_t size) - : IMemoryRegion(size), _mapping(nullptr), _ssbo_name(0) -{ -} - -const GLuint &IGCMemoryRegion::gc_ssbo_name() const -{ - return _ssbo_name; -} - -void *IGCMemoryRegion::buffer() -{ - return _mapping; -} - -const void *IGCMemoryRegion::buffer() const -{ - return _mapping; -} - -GCBufferMemoryRegion::GCBufferMemoryRegion(size_t size) - : IGCMemoryRegion(size) -{ - ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); -} - -GCBufferMemoryRegion::~GCBufferMemoryRegion() -{ - ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name)); -} - -void *GCBufferMemoryRegion::ptr() -{ - return nullptr; -} - -void *GCBufferMemoryRegion::map(bool blocking) -{ - ARM_COMPUTE_ERROR_ON(_mapping != nullptr); - ARM_COMPUTE_UNUSED(blocking); - - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)); - _mapping = reinterpret_cast<uint8_t *>(p); - - return _mapping; -} - -void GCBufferMemoryRegion::unmap() -{ - ARM_COMPUTE_ERROR_ON(_mapping == nullptr); - - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); - _mapping = nullptr; -} - -std::unique_ptr<IMemoryRegion> GCBufferMemoryRegion::extract_subregion(size_t offset, size_t size) -{ - ARM_COMPUTE_UNUSED(offset, size); - return nullptr; -} -} // namespace arm_compute
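For reference, the essence of the removed GCBufferMemoryRegion in raw GLES 3.1 calls — a sketch assuming a current EGL context and the <GLES3/gl31.h> header, with the library's error checking omitted:

#include <GLES3/gl31.h>

void ssbo_map_unmap_sketch()
{
    const GLsizeiptr size = 1024; // example allocation size
    GLuint ssbo = 0;
    glGenBuffers(1, &ssbo);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
    glBufferData(GL_SHADER_STORAGE_BUFFER, size, nullptr, GL_STATIC_DRAW); // mirrors the constructor
    void *p = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, size,
                               GL_MAP_READ_BIT | GL_MAP_WRITE_BIT);        // mirrors map()
    (void)p;                                                               // host-visible pointer, valid until unmapped
    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);                               // mirrors unmap()
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
    glDeleteBuffers(1, &ssbo);                                             // mirrors the destructor
}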
\ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp deleted file mode 100644 index a45d7931be..0000000000 --- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" -#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" - -using namespace arm_compute; - -std::once_flag GCScheduler::_initialize_symbols; - -GCScheduler::GCScheduler() - : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD) -{ -} - -GCScheduler::~GCScheduler() -{ - eglDestroyContext(_display, _context); - eglTerminate(_display); - - _context = EGL_NO_CONTEXT; - _display = EGL_NO_DISPLAY; -} - -void GCScheduler::default_init() -{ - setup_context(); - - init(_display, _context); -} - -void GCScheduler::default_init_with_context(EGLDisplay display, EGLContext ctx) -{ - _context = ctx; - _display = display; - - _target = get_target_from_device(); -} - -void GCScheduler::init(EGLDisplay dpy, EGLContext ctx) -{ - _target = get_target_from_device(); - - GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx); -} - -GCScheduler &GCScheduler::get() -{ - std::call_once(_initialize_symbols, opengles31_is_available); - static GCScheduler scheduler; - return scheduler; -} - -void GCScheduler::dispatch(IGCKernel &kernel, bool flush) -{ - kernel.run(kernel.window()); - if(flush) - { - ARM_COMPUTE_GL_CHECK(glFlush()); - } -} - -void GCScheduler::memory_barrier() -{ - ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)); -} - -void GCScheduler::setup_context() -{ - EGLBoolean res; - _display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError()); - - res = eglInitialize(_display, nullptr, nullptr); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context"); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query 
EGL_KHR_surfaceless_context"); - ARM_COMPUTE_UNUSED(egl_extension_st); - - const std::array<EGLint, 3> config_attribs = - { - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, - EGL_NONE - }; - EGLConfig cfg; - EGLint count; - - res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglBindAPI(EGL_OPENGL_ES_API); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError()); - - const std::array<EGLint, 3> attribs = - { - EGL_CONTEXT_CLIENT_VERSION, 3, - EGL_NONE - }; - _context = eglCreateContext(_display, - cfg, - EGL_NO_CONTEXT, - attribs.data()); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); -} diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp deleted file mode 100644 index 61523bcc31..0000000000 --- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -GCTensorAllocator::GCTensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr) -{ -} - -uint8_t *GCTensorAllocator::data() -{ - return _mapping; -} - -void GCTensorAllocator::allocate() -{ - if(_associated_memory_group == nullptr) - { - _memory.set_owned_region(support::cpp14::make_unique<GCBufferMemoryRegion>(info().total_size())); - } - else - { - _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment()); - } - info().set_is_resizable(false); -} - -void GCTensorAllocator::free() -{ - _mapping = nullptr; - _memory.set_region(nullptr); - info().set_is_resizable(true); -} - -void GCTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_memory_group) -{ - ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr); - ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr && _associated_memory_group != associated_memory_group); - ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.gc_region()->gc_ssbo_name() != 0); - - _associated_memory_group = associated_memory_group; -} - -uint8_t *GCTensorAllocator::lock() -{ - return map(true); -} - -void GCTensorAllocator::unlock() -{ - unmap(); -} - -GLuint GCTensorAllocator::get_gl_ssbo_name() const -{ - return (_memory.region() == nullptr) ? static_cast<GLuint>(0) : _memory.gc_region()->gc_ssbo_name(); -} - -uint8_t *GCTensorAllocator::map(bool blocking) -{ - ARM_COMPUTE_ERROR_ON(_mapping != nullptr); - ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - - _mapping = reinterpret_cast<uint8_t *>(_memory.gc_region()->map(blocking)); - return _mapping; -} - -void GCTensorAllocator::unmap() -{ - ARM_COMPUTE_ERROR_ON(_mapping == nullptr); - ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - - _memory.gc_region()->unmap(); - _mapping = nullptr; -} diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp deleted file mode 100644 index bb9239eabe..0000000000 --- a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -using namespace arm_compute; - -IGCSimpleFunction::IGCSimpleFunction(GCRuntimeContext *ctx) //NOLINT - : _kernel(), - _border_handler(), - _ctx(ctx) -{ -} - -void IGCSimpleFunction::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured"); - GCScheduler *scheduler = (_ctx != nullptr) ? _ctx->gpu_scheduler() : &GCScheduler::get().get(); - ARM_COMPUTE_ERROR_ON(scheduler == nullptr); - - scheduler->dispatch(_border_handler, false); - scheduler->memory_barrier(); - scheduler->dispatch(*_kernel); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp deleted file mode 100644 index 5098dd786d..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h" -#include "arm_compute/core/Helpers.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCAbsoluteDifference::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<GCAbsoluteDifferenceKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp deleted file mode 100755 index b0d8a3cf9f..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCArithmeticAddition::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<GCArithmeticAdditionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); -} - -Status GCArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return GCArithmeticAdditionKernel::validate(input1, input2, output, policy); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp deleted file mode 100755 index cc5e8f49f2..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -using namespace arm_compute; - -GCBatchNormalizationLayer::GCBatchNormalizationLayer() - : _norm_kernel() -{ -} - -void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon, - ActivationLayerInfo act_info) -{ - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); -} - -void GCBatchNormalizationLayer::run() -{ - GCScheduler::get().dispatch(_norm_kernel, true); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp deleted file mode 100644 index 81e98f1ba2..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -GCConcatenateLayer::GCConcatenateLayer() - : _concat_kernels(), - _num_inputs(0), - _axis(Window::DimZ) -{ -} - -void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis) -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - _axis = axis; - - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimZ: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } -} - -void GCConcatenateLayer::run() -{ - for(auto &kernel : _concat_kernels) - { - GCScheduler::get().dispatch(*kernel, true); - } -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp deleted file mode 100644 index 61c0740937..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
-GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel()
-{
-}
-
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
-
-    _weights_reshape_kernel.configure(weights, biases_to_use, output);
-}
-
-void GCConvolutionLayerReshapeWeights::run()
-{
-    GCScheduler::get().dispatch(_weights_reshape_kernel);
-}
-
-GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
-      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
-
-    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
-}
-
-Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
-{
-    // Perform validation step on Matrix multiply function
-    GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
-    return Status{};
-}
-
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(num_groups > 1);
-    ARM_COMPUTE_UNUSED(num_groups);
-
-    _is_prepared      = false;
-    _original_weights = weights;
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    const DataType dt = input->info()->data_type();
-
-    // Set the GPU target for im2col and col2im
-    _input_im2col_kernel.set_target(GCScheduler::get().get_target());
-    _output_col2im_kernel.set_target(GCScheduler::get().get_target());
-
-    const bool       append_bias   = (biases != nullptr);
-    const unsigned   bias_element  = (append_bias) ? 1 : 0;
-    const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-
-    const unsigned int kernel_width  = weights->info()->dimension(0);
-    const unsigned int kernel_height = weights->info()->dimension(1);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info, dilation);
-
-    unsigned int mat_weights_cols = weights->info()->dimension(3);
-    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-
-    // _weights_reshaped will be auto configured in the kernel.
-    // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
-    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
-
-    weights = &_weights_reshaped;
-
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
-    _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
-    _memory_group.manage(&_input_im2col_reshaped);
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = dt;
-
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
-
-    if(dt == DataType::F16)
-    {
-        BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
-        input->info()->extend_padding(border_size);
-        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
-    }
-
-    // Configure im2col
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
-
-    // Configure GEMM
-    configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
-
-    _input_im2col_reshaped.allocator()->allocate();
-
-    // Configure Col2Im
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
-    _gemm_output.allocator()->allocate();
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
-    // Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-
-    ARM_COMPUTE_UNUSED(weights_info);
-}
-
-void GCConvolutionLayer::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run im2col
-    GCScheduler::get().dispatch(_fill_border);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_input_im2col_kernel);
-
-    // Run gemm on reshaped matrices
-    _mm_gemm.run();
-    GCScheduler::get().memory_barrier();
-
-    // Reshape output matrix
-    GCScheduler::get().dispatch(_output_col2im_kernel, false);
-    GCScheduler::get().memory_barrier();
-
-    // Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
-
-void GCConvolutionLayer::prepare()
-{
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights reshaping and mark as unused
-        _weights_reshaped.allocator()->allocate();
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-
-        _is_prepared = true;
-    }
-}
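The deleted GCConvolutionLayer lowers convolution to a matrix multiply: im2col linearizes each receptive field into one row, the reshaped weights form the other operand, and col2im folds the GEMM result back into a feature map. The shape bookkeeping used in configure() can be checked with a short sketch; `conv_out_dim` mirrors what the library's `scaled_dimensions` computes for the floor-rounding case, and the concrete numbers are only an example:

#include <cstdio>

// Output extent of a convolution along one axis, floor rounding.
unsigned conv_out_dim(unsigned in, unsigned kernel, unsigned stride, unsigned pad, unsigned dilation = 1)
{
    const unsigned eff_kernel = (kernel - 1) * dilation + 1; // dilated kernel extent
    return (in + 2 * pad - eff_kernel) / stride + 1;
}

int main()
{
    // Example: 224x224x3 input, 64 filters of 7x7x3, stride 2, pad 3, with bias.
    const unsigned conv_w = conv_out_dim(224, 7, 2, 3); // 112
    const unsigned conv_h = conv_out_dim(224, 7, 2, 3); // 112

    // The same quantities as in GCConvolutionLayer::configure():
    const unsigned mat_weights_rows = 7 * 7 * 3 + 1; // one extra row appends the bias
    const unsigned mat_weights_cols = 64;            // one column per output channel
    const unsigned mat_input_cols   = mat_weights_rows;
    const unsigned mat_input_rows   = conv_w * conv_h;

    // GEMM: [mat_input_rows x mat_input_cols] * [mat_input_cols x mat_weights_cols]
    std::printf("im2col: %ux%u, gemm out: %ux%u\n", mat_input_rows, mat_input_cols, mat_input_rows, mat_weights_cols);
    return 0;
}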
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
deleted file mode 100644
index 9a9f30d4be..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler(), _shift_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
-{
-}
-
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
-{
-    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
-    ARM_COMPUTE_UNUSED(dilation);
-    auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
-    k->configure(input, weights, biases, output, conv_info, depth_multiplier);
-    _kernel = std::move(k);
-
-    // Configure border handler
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
-    _shift_handler.configure(input);
-
-    // Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-}
-
-void GCDepthwiseConvolutionLayer3x3::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-
-    // Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
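Depthwise convolution, as implemented by the deleted 3x3 function, applies one filter per input channel (times the depth multiplier) instead of summing across channels. A scalar reference of the 3x3, stride-1, depth-multiplier-1 case, assuming one pixel of zero padding (a sketch with illustrative names, not library code):

#include <vector>

// in/out: one dense w*h plane per channel; weights: 9 taps per channel.
// One pixel of zero padding keeps the output the same size as the input.
void depthwise3x3(const std::vector<std::vector<float>> &in, std::vector<std::vector<float>> &out,
                  const std::vector<std::vector<float>> &weights, int w, int h)
{
    out.assign(in.size(), std::vector<float>(static_cast<size_t>(w) * h, 0.f));
    for(size_t c = 0; c < in.size(); ++c) // each channel is filtered independently
    {
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                float acc = 0.f;
                for(int ky = -1; ky <= 1; ++ky)
                {
                    for(int kx = -1; kx <= 1; ++kx)
                    {
                        const int sx = x + kx, sy = y + ky;
                        if(sx >= 0 && sx < w && sy >= 0 && sy < h) // zero border
                        {
                            acc += in[c][sy * w + sx] * weights[c][(ky + 1) * 3 + (kx + 1)];
                        }
                    }
                }
                out[c][y * w + x] = acc;
            }
        }
    }
}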
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
deleted file mode 100644
index 3bc3398cb5..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-GCDirectConvolutionLayer::GCDirectConvolutionLayer()
-    : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                         const ActivationLayerInfo &act_info)
-{
-    int kernel_size = weights->info()->dimension(0);
-
-    if(kernel_size == 1)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else if(kernel_size == 3)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else if(kernel_size == 5)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("kernel size unsupported!");
-        return;
-    }
-
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
-    _shift_handler.configure(input);
-}
-
-void GCDirectConvolutionLayer::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_shift_handler);
-}
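GCDirectConvolutionLayer selects a specialized kernel per filter size rather than using one generic kernel, trading flexibility for shaders tuned to a fixed tap count. The same selection can be expressed as a factory table, which keeps the supported sizes in one place; this is a sketch of the pattern only, with stand-in kernel types:

#include <functional>
#include <map>
#include <memory>
#include <stdexcept>

struct IKernel { virtual ~IKernel() = default; };
struct Conv1x1 : IKernel {};
struct Conv3x3 : IKernel {};
struct Conv5x5 : IKernel {};

std::unique_ptr<IKernel> make_direct_conv_kernel(int kernel_size)
{
    // One factory per supported tap count, mirroring the if/else chain above.
    static const std::map<int, std::function<std::unique_ptr<IKernel>()>> factories = {
        {1, [] { return std::make_unique<Conv1x1>(); }},
        {3, [] { return std::make_unique<Conv3x3>(); }},
        {5, [] { return std::make_unique<Conv5x5>(); }},
    };
    const auto it = factories.find(kernel_size);
    if(it == factories.end())
    {
        throw std::runtime_error("kernel size unsupported!"); // same contract as ARM_COMPUTE_ERROR
    }
    return it->second();
}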
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
deleted file mode 100644
index 6407464e48..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-
-using namespace arm_compute;
-
-GCDropoutLayer::GCDropoutLayer()
-    : _dropout_kernel()
-{
-}
-
-void GCDropoutLayer::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
-
-    // Configure kernel
-    _dropout_kernel.configure(input, mask, output, ratio, forward);
-}
-
-void GCDropoutLayer::run()
-{
-    GCScheduler::get().dispatch(_dropout_kernel);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
deleted file mode 100644
index d1d9874449..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCFillBorder::configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCFillBorderKernel>();
-    k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
-    _kernel = std::move(k);
-}
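GCFillBorder shows the pattern shared by most of these thin wrappers (and by GCScale, GCTranspose and others below): configure() instantiates a concrete kernel, configures it, and moves it into a `_kernel` pointer owned by a common simple-function base whose run() just dispatches it. The shape of that pattern, reduced to its essentials (a sketch; the interface names here are illustrative, not the library's):

#include <memory>
#include <utility>

struct IGCKernel
{
    virtual ~IGCKernel() = default;
    virtual void run() = 0;
};

// Base class owning a single type-erased kernel; subclasses differ only
// in which kernel configure() creates.
class SimpleFunction
{
public:
    void run() { _kernel->run(); } // dispatch the configured kernel
protected:
    std::unique_ptr<IGCKernel> _kernel;
};

struct FillBorderKernel : IGCKernel
{
    void configure(int border_width) { _width = border_width; }
    void run() override { /* fill the border region */ }
    int _width = 0;
};

class FillBorder : public SimpleFunction
{
public:
    void configure(int border_width)
    {
        auto k = std::make_unique<FillBorderKernel>(); // create the concrete kernel
        k->configure(border_width);                    // configure it
        _kernel = std::move(k);                        // hand ownership to the base
    }
};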
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
deleted file mode 100644
index d391eddf84..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
-
-GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(std::move(memory_manager)), _weights_manager(std::move(weights_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(),
-      _reshape_weights_output(), _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
-{
-}
-
-void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-    const DataType dt = input->info()->data_type();
-
-    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, input->info()->dimension(3));
-    shape_im2col.set(2, input->info()->dimension(4));
-    shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
-
-    // Configure im2col kernel
-    _memory_group.manage(&_im2col_output);
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
-
-    // Allocate the output tensor for im2col once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
-}
-
-void GCFullyConnectedLayer::configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(input, weights, output, 1.0f, false);
-}
-
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
-                                      FullyConnectedLayerInfo fc_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
-
-    _original_weights     = weights;
-    _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    _is_fc_after_conv     = true;
-    _accumulate_biases    = false;
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-        _accumulate_biases = true;
-
-        // Configure accumulate biases kernel
-        _accumulate_biases_kernel.configure(output, biases);
-    }
-
-    // With the Fully Connected layer we can have 4 different cases:
-    //  1) Convolution layer -> Fully Connected layer without batches
-    //  2) Fully Connected layer -> Fully Connected layer without batches
-    //  3) Convolution layer -> Fully Connected layer with batches
-    //  4) Fully Connected layer -> Fully Connected layer with batches
-
-    const IGCTensor *weights_to_use = weights;
-
-    if(!_are_weights_reshaped)
-    {
-        weights_to_use = &_reshape_weights_output;
-
-        // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
-    }
-
-    // Check if we have a fully connected layer with batches
-    const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-
-    if(is_batched_fc_layer)
-    {
-        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                                                                  input->info()->tensor_shape().cend(),
-                                                                                  output->info()->tensor_shape().cbegin() + 1));
-    }
-    else
-    {
-        _is_fc_after_conv = input->info()->num_dimensions() > 1;
-    }
-
-    if(_is_fc_after_conv)
-    {
-        // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, output);
-    }
-    else
-    {
-        // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, output);
-    }
-
-    ARM_COMPUTE_ERROR_ON(fc_info.retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
-    _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-}
-
-void GCFullyConnectedLayer::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Linearize input if it comes from a convolutional layer
-    if(_is_fc_after_conv)
-    {
-        GCScheduler::get().dispatch(_im2col_kernel, false);
-    }
-
-    if(!_are_weights_reshaped || _is_fc_after_conv)
-    {
-        GCScheduler::get().memory_barrier();
-    }
-
-    // Run matrix multiply
-    GCScheduler::get().dispatch(_mm_kernel, !_accumulate_biases);
-
-    // Accumulate biases if provided
-    if(_accumulate_biases)
-    {
-        GCScheduler::get().memory_barrier();
-
-        GCScheduler::get().dispatch(_accumulate_biases_kernel);
-    }
-}
-
-void GCFullyConnectedLayer::prepare()
-{
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run reshape weights kernel and mark weights as unused
-        _reshape_weights_output.allocator()->allocate();
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-
-        _are_weights_reshaped = true;
-    }
-}
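The four cases enumerated in GCFullyConnectedLayer::configure() reduce to one question: do the trailing dimensions of the input line up with the batch dimensions of the output? When they do not, the input is still a spatial feature map and must be flattened (im2col) before the matrix multiply. The decision, isolated over plain shape vectors (a sketch; shapes are lowest-dimension-first as in arm_compute::TensorShape):

#include <algorithm>
#include <cstddef>
#include <vector>

bool is_fc_after_conv(const std::vector<std::size_t> &in_shape, const std::vector<std::size_t> &out_shape)
{
    const bool is_batched = out_shape.size() > 1 && out_shape[1] > 1;
    if(is_batched)
    {
        // Batched: input dims from index 3 on must match output dims from index 1 on.
        if(in_shape.size() < 4 || in_shape.size() - 3 > out_shape.size() - 1)
        {
            return false;
        }
        return std::equal(in_shape.cbegin() + 3, in_shape.cend(), out_shape.cbegin() + 1);
    }
    // Unbatched: any multi-dimensional input must have come from a convolution.
    return in_shape.size() > 1;
}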
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
deleted file mode 100644
index ddfe590ee1..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
-    if(c != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
-        ARM_COMPUTE_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(gemm_info);
-    return Status{};
-}
-} // namespace
-
-GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
-
-    // Check if we need to reshape the matrix B only on the first run
-    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = false;
-    _original_b                  = b;
-
-    const IGCTensor *matrix_a = a;
-    const IGCTensor *matrix_b = b;
-
-    // Get the GPU target
-    const GPUTarget gpu_target = GCScheduler::get().get_target();
-
-    // Set the target for the kernels
-    _interleave_kernel.set_target(gpu_target);
-    _mm_kernel.set_target(gpu_target);
-
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    const int m                         = a->info()->dimension(1);
-    const int n                         = b->info()->dimension(0);
-    const int k                         = a->info()->dimension(0);
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
-
-    // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
-    _is_interleaved_transposed = a->info()->dimension(1) > 16;
-
-    if(_is_interleaved_transposed)
-    {
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        if(!_reshape_b_only_on_first_run)
-        {
-            _memory_group.manage(&_tmp_b);
-        }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a);
-
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b);
-    }
-
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
-
-    if(_is_interleaved_transposed)
-    {
-        // Allocate intermediate tensors
-        _tmp_a.allocator()->allocate();
-        if(!_reshape_b_only_on_first_run)
-        {
-            _tmp_b.allocator()->allocate();
-        }
-    }
-
-    // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
-    {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
-    }
-}
-
-Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
-    return Status{};
-}
-
-void GCGEMM::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_is_interleaved_transposed)
-    {
-        // Run interleave kernel
-        GCScheduler::get().dispatch(_interleave_kernel, false);
-
-        if(!_reshape_b_only_on_first_run)
-        {
-            // Run transpose kernel
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-        }
-
-        GCScheduler::get().memory_barrier();
-    }
-
-    // Run matrix multiply kernel
-    GCScheduler::get().dispatch(_mm_kernel, !_run_addition);
-
-    // Run matrix addition kernel
-    if(_run_addition)
-    {
-        GCScheduler::get().memory_barrier();
-        GCScheduler::get().dispatch(_ma_kernel);
-    }
-}
-
-void GCGEMM::prepare()
-{
-    if(!_is_prepared)
-    {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
-            // Run transpose kernel
-            _tmp_b.allocator()->allocate();
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-            GCScheduler::get().memory_barrier();
-
-            // Mark original weights tensor as unused
-            _original_b->mark_as_unused();
-        }
-
-        _is_prepared = true;
-    }
-}
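GCGEMM only reshapes its operands when A has more than 16 rows: A is interleaved in 4x4 blocks and B is transposed in 1xW blocks so the multiply kernel can read both with unit stride. As an illustration of the A-side transform, here is one way to lay out a 4x4-interleaved matrix on the CPU; this sketches the common Compute Library layout, and the exact GLES kernel layout may differ in vector width or padding:

#include <vector>

// Interleave rows of an MxN row-major matrix in groups of 4:
// output row g holds [a(4g,0) a(4g+1,0) a(4g+2,0) a(4g+3,0) a(4g,1) ...],
// so the multiply kernel streams 4 accumulator rows with one linear read.
// M is assumed to be a multiple of 4 to keep the sketch short.
std::vector<float> interleave_4x4(const std::vector<float> &a, int m, int n)
{
    std::vector<float> out(static_cast<size_t>(m) * n);
    size_t dst = 0;
    for(int g = 0; g < m / 4; ++g)       // each group of 4 source rows...
    {
        for(int col = 0; col < n; ++col) // ...walks the columns left to right
        {
            for(int r = 0; r < 4; ++r)   // ...emitting 4 vertically adjacent values
            {
                out[dst++] = a[static_cast<size_t>(4 * g + r) * n + col];
            }
        }
    }
    return out;
}

The same prepare()-once pattern as the convolution and fully connected layers applies to B: when reshape_b_only_on_first_run is set, the transposed copy is produced a single time and the original B is marked unused.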
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
deleted file mode 100644
index cc37bf4b4d..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCGEMMInterleave4x4::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCGEMMInterleave4x4Kernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
deleted file mode 100644
index af933fa578..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCGEMMTranspose1xW::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCGEMMTranspose1xWKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
deleted file mode 100644
index 8f602792a8..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
-{
-}
-
-void GCNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-
-    _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
-    _memory_group.manage(&_squared_input);
-
-    _norm_kernel.configure(input, &_squared_input, output, norm_info);
-    _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
-    // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
-
-    // Allocate intermediate buffers
-    _squared_input.allocator()->allocate();
-}
-
-void GCNormalizationLayer::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    GCScheduler::get().dispatch(_multiply_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_norm_kernel, true);
-}
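GCNormalizationLayer squares the input with the pixel-wise multiply kernel first because local response normalization divides each activation by a power of a local sum of squares. In the usual formulation (NormalizationLayerInfo carries the kappa, alpha, beta and window-size parameters; the division of alpha by the window size shown here is the conventional scaling and may be folded differently inside the shader):

$$ y_i \;=\; \frac{x_i}{\left(\kappa + \frac{\alpha}{n}\sum_{j \in N(i)} x_j^{2}\right)^{\beta}} $$

where $N(i)$ is the $n$-element window around position $i$: a spatial neighbourhood for IN_MAP normalization (the variant the border comment above refers to) or the neighbouring feature maps for CROSS_MAP.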
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
deleted file mode 100755
index 19fdc3d7c0..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizePlanarYUVLayer::GCNormalizePlanarYUVLayer()
-    : _norm_kernel()
-{
-}
-
-void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
-{
-    _norm_kernel.configure(input, output, mean, std);
-}
-
-Status GCNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *std)
-{
-    return GCNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
-}
-
-void GCNormalizePlanarYUVLayer::run()
-{
-    GCScheduler::get().dispatch(_norm_kernel, true);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
deleted file mode 100755
index 1075f0b5be..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCPixelWiseMultiplication::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<GCPixelWiseMultiplicationKernel>();
-    k->configure(input1, input2, output, scale);
-    _kernel = std::move(k);
-}
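For reference, the GCNormalizePlanarYUVLayer deleted just above is a per-channel affine normalization: each value is shifted by its channel mean and divided by its channel standard deviation. A scalar sketch of that semantic:

#include <cstddef>
#include <vector>

// One mean/std pair per channel; planes hold w*h values each.
void normalize_planar_yuv(std::vector<std::vector<float>> &planes,
                          const std::vector<float> &mean, const std::vector<float> &stddev)
{
    for(std::size_t c = 0; c < planes.size(); ++c)
    {
        for(float &v : planes[c])
        {
            v = (v - mean[c]) / stddev[c]; // out = (in - mean) / std, channel-wise
        }
    }
}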
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
deleted file mode 100644
index accf60e204..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include "support/MemorySupport.h"
-
-namespace arm_compute
-{
-GCPoolingLayer::GCPoolingLayer()
-    : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
-{
-    // Configure pooling kernel
-    auto k = arm_compute::support::cpp14::make_unique<GCPoolingLayerKernel>();
-    k->configure(input, output, pool_info, indices);
-    _kernel = std::move(k);
-
-    // Configure border depending on operation required
-    BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
-
-    _shift_handler.configure(input);
-}
-
-Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
-    return GCPoolingLayerKernel::validate(input, output, pool_info, indices);
-}
-
-void GCPoolingLayer::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-}
-} // namespace arm_compute
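The border mode chosen in GCPoolingLayer::configure() matters for correctness, not just padding: max pooling with a constant border of 0 would let the padding win whenever every real value under the window is negative, so the edge values are replicated instead, while average pooling tolerates a constant border. A tiny sketch of the difference on one window hanging over an edge:

#include <algorithm>
#include <cstdio>

int main()
{
    // A 1-D window of 3 hanging over the left edge of {-5, -2, ...}.
    const float pad_constant = 0.f;

    // CONSTANT border: the fabricated 0 beats every real (negative) value.
    const float max_constant = std::max({pad_constant, -5.f, -2.f}); // 0, wrong for max pooling

    // REPLICATE border: the edge value -5 is repeated, so the max stays real.
    const float max_replicate = std::max({-5.f, -5.f, -2.f}); // -2

    std::printf("constant: %.0f, replicate: %.0f\n", max_constant, max_replicate);
    return 0;
}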
diff --git a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
deleted file mode 100644
index f245c3ecd0..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-#include "arm_compute/core/Validate.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCScale::configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
-                        bool align_corners)
-{
-    ARM_COMPUTE_UNUSED(use_padding, align_corners);
-    auto k = arm_compute::support::cpp14::make_unique<GCScaleKernel>();
-    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
deleted file mode 100644
index 0645ae7f8f..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
-{
-}
-
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
-{
-    ARM_COMPUTE_UNUSED(beta, axis);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(beta != 1.0f);
-    ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
-
-    // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
-
-    TensorShape shape = input->info()->tensor_shape();
-    shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
-    _max.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->init(tensor_info_max_sum);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp);
-    _memory_group.manage(&_max);
-    _memory_group.manage(&_sum);
-
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
-    _norm_kernel.configure(&_tmp, &_sum, output);
-
-    // Allocate intermediate buffers
-    _tmp.allocator()->allocate();
-    _max.allocator()->allocate();
-    _sum.allocator()->allocate();
-}
-
-void GCSoftmaxLayer::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    GCScheduler::get().dispatch(_max_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_norm_kernel);
-}
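The three kernels GCSoftmaxLayer dispatches implement the standard numerically stable softmax: a row maximum is computed first so it can be subtracted before exponentiation (keeping exp() from overflowing), the shifted exponentials are summed, and a final pass normalizes. The same three passes on the CPU (a reference sketch of the algorithm, not the shader code):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax(const std::vector<float> &x)
{
    // Pass 1: row maximum (the _max_kernel pass above).
    const float m = *std::max_element(x.begin(), x.end());

    // Pass 2: shift, exponentiate, accumulate the sum (_shift_exp_sum_kernel).
    std::vector<float> y(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - m); // subtracting m keeps exp() in range
        sum += y[i];
    }

    // Pass 3: normalize (_norm_kernel).
    for(float &v : y)
    {
        v /= sum;
    }
    return y;
}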
- */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h" - -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void GCTensorShift::configure(IGCTensor *input) -{ - auto k = arm_compute::support::cpp14::make_unique<GCTensorShiftKernel>(); - k->configure(input); - _kernel = std::move(k); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp deleted file mode 100644 index 530f52abd6..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCTranspose::configure(const IGCTensor *input, IGCTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp deleted file mode 100644 index d312967699..0000000000 --- a/src/runtime/HOG.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/HOG.h" - -#include "arm_compute/core/Error.h" - -using namespace arm_compute; - -HOG::HOG() - : IHOG(), _info(), _descriptor() -{ -} - -void HOG::init(const HOGInfo &input) -{ - _info = input; - _descriptor.resize(_info.descriptor_size()); -} - -float *HOG::descriptor() const -{ - return _descriptor.data(); -} - -const HOGInfo *HOG::info() const -{ - return &_info; -} diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp deleted file mode 100644 index fb961638f1..0000000000 --- a/src/runtime/ILutAllocator.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/ILutAllocator.h" - -#include "arm_compute/core/Utils.h" - -using namespace arm_compute; - -ILutAllocator::ILutAllocator() - : _num_elements(0), _data_type(DataType::U8) -{ -} - -void ILutAllocator::init(size_t num_elements, DataType data_type) -{ - // Init internal metadata - _num_elements = num_elements; - _data_type = data_type; - - // Allocate the image's memory - allocate(); -} - -size_t ILutAllocator::num_elements() const -{ - return _num_elements; -} - -DataType ILutAllocator::type() const -{ - return _data_type; -} - -size_t ILutAllocator::size() const -{ - return data_size_from_type(_data_type) * num_elements(); -} diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index b2edad0ca5..ecf84abd2c 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,198 @@ */ #include "arm_compute/runtime/IScheduler.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/Window.h" + +#include "src/common/cpuinfo/CpuInfo.h" +#include "src/runtime/SchedulerUtils.h" namespace arm_compute { IScheduler::IScheduler() - : _cpu_info() { - get_cpu_configuration(_cpu_info); // Work out the best possible number of execution threads - _num_threads_hint = get_threads_hint(); + _num_threads_hint = cpuinfo::num_threads_hint(); } CPUInfo &IScheduler::cpu_info() { - return _cpu_info; + return CPUInfo::get(); +} + +void IScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) +{ + ARM_COMPUTE_UNUSED(num_threads, func); + ARM_COMPUTE_ERROR("Feature for affinity setting is not implemented"); } unsigned int IScheduler::num_threads_hint() const { return _num_threads_hint; } + +void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); +#ifndef BARE_METAL + const Window &max_window = window; + if (hints.split_dimension() == IScheduler::split_dimensions_all) + { + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... + unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); + + std::vector<IScheduler::Workload> workloads; + for (unsigned int ni = 0; ni != n_threads; ++ni) + { + for (unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); + + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + }); + } + } + run_workloads(workloads); + } + else + { + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = std::min(num_iterations, this->num_threads()); + + if (num_iterations == 0) + { + return; + } + + if (!kernel->is_parallelisable() || num_threads == 1) + { + ThreadInfo info; + info.cpu_info = &cpu_info(); + if (tensors.empty()) + { + kernel->run(max_window, info); + } + else + { + kernel->run_op(tensors, max_window, info); + } + } + else + { + unsigned int num_windows = 0; + switch (hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = + (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + // Make sure the smallest window is larger than minimum workload size + num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info()); + + std::vector<IScheduler::Workload> workloads(num_windows); + for (unsigned int t = 0; t < num_windows; ++t) + { + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + + if (tensors.empty()) + { + kernel->run(win, info); + } + else + { + kernel->run_op(tensors, win, info); + } + }; + } + run_workloads(workloads); + } + } +#else /* !BARE_METAL */ + ARM_COMPUTE_UNUSED(kernel, hints, window, tensors); +#endif /* !BARE_METAL */ +} + void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag) { ARM_COMPUTE_UNUSED(tag); run_workloads(workloads); } +std::size_t IScheduler::adjust_num_of_windows(const Window &window, + std::size_t split_dimension, + std::size_t init_num_windows, + const ICPPKernel &kernel, + const CPUInfo &cpu_info) +{ + // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow"). + if (window.num_iterations(split_dimension) < init_num_windows) + { + auto recommended_split_dim = Window::DimX; + for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) + { + if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) + { + recommended_split_dim = dims; + } + } + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", + split_dimension, recommended_split_dim); + } + + for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first + { + // Try splitting the workload into t, subject to each subworkload size <= mws. + if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) + { + if (t != init_num_windows) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using a different thread count than the one assigned by the user."); + } + return t; + } + } + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using single thread instead of the thread count assigned by the user."); + return 1; // If the workload is so small that it can't be split, we should run a single thread +} + } // namespace arm_compute diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp index d0c9919e26..8e5b62ae7d 100644 --- a/src/runtime/ISimpleLifetimeManager.cpp +++ b/src/runtime/ISimpleLifetimeManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
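The rewritten IScheduler::schedule_common above splits the iteration space one of two ways: over all dimensions, where split_2d picks an m_threads x n_threads grid and each workload narrows the window to its (mi, ni) tile, or along one hinted dimension into num_windows slices. The per-workload 1-D partitioning is a balanced split; roughly (a sketch of the arithmetic, not the actual Window::split_window implementation, whose remainder handling may differ):

#include <algorithm>
#include <cstddef>

struct Range
{
    std::size_t start; // first iteration owned by this window
    std::size_t end;   // one past the last iteration
};

// Split `total` iterations into `num_windows` near-equal [start, end) slices;
// the first `total % num_windows` slices get one extra iteration.
Range split_range(std::size_t total, std::size_t id, std::size_t num_windows)
{
    const std::size_t base  = total / num_windows;
    const std::size_t rem   = total % num_windows;
    const std::size_t start = id * base + std::min(id, rem);
    const std::size_t len   = base + (id < rem ? 1 : 0);
    return {start, start + len};
}

adjust_num_of_windows then shrinks the window count until each slice meets the kernel's minimum workload size (get_mws), falling back to a single thread when the workload is too small to split.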
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager() void ISimpleLifetimeManager::register_group(IMemoryGroup *group) { - if(_active_group == nullptr) + if (_active_group == nullptr) { ARM_COMPUTE_ERROR_ON(group == nullptr); _active_group = group; @@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group) bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) { - if(group == nullptr) + if (group == nullptr) { return false; } const bool status = bool(_finalized_groups.erase(group)); - if(status) + if (status) { group->mappings().clear(); } @@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) void ISimpleLifetimeManager::start_lifetime(void *obj) { ARM_COMPUTE_ERROR_ON(obj == nullptr); - ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!"); + ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), + "Memory object is already registered!"); // Check if there is a free blob - if(_free_blobs.empty()) + if (_free_blobs.empty()) { - _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } }); + _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}}); } else { @@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e) - { - return !e.second.status; - }); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair<void *, Element> &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp index 087f324922..fe3d2804cb 100644 --- a/src/runtime/ITensorAllocator.cpp +++ b/src/runtime/ITensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,25 +30,27 @@ using namespace arm_compute; -ITensorAllocator::ITensorAllocator() - : _info(), _alignment(0) +void ITensorAllocator::init(const TensorInfo &input, size_t alignment) { + _info_owned = input; + _info_external = nullptr; + _alignment = alignment; } -void ITensorAllocator::init(const TensorInfo &input, size_t alignment) +void ITensorAllocator::soft_init(TensorInfo &input, size_t alignment) { - _info = input; - _alignment = alignment; + _info_external = &input; + _alignment = alignment; } TensorInfo &ITensorAllocator::info() { - return _info; + return (_info_external != nullptr) ? *_info_external : _info_owned; } const TensorInfo &ITensorAllocator::info() const { - return _info; + return (_info_external != nullptr) ? *_info_external : _info_owned; } size_t ITensorAllocator::alignment() const diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index b367b5f70b..96287dcc49 100644 --- a/src/runtime/IWeightsManager.cpp +++ b/src/runtime/IWeightsManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,23 +25,27 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; + _managed_counter[weights]; + } + else + { + _managed_counter[weights].counter++; } // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -54,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -68,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -76,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -87,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. 
If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -118,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -134,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -146,4 +150,28 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei return transformed_weights; } + +void IWeightsManager::release(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].counter--; + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + { + weights->mark_as_unused(); + } +} + +void IWeightsManager::pre_mark_as_unused(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].is_unused = true; +} } // namespace arm_compute diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp deleted file mode 100644 index 1b3daf1f60..0000000000 --- a/src/runtime/Lut.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
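The new reference counting in IWeightsManager lets several functions share one managed weights tensor: manage() counts users, pre_mark_as_unused() records the intent, and release() performs the deferred mark_as_unused() once the count drains. A hypothetical call sequence, with the counter values inferred from the diff above rather than from documentation:

void share_weights(IWeightsManager &wm, const ITensor *weights)
{
    wm.manage(weights);             // first user: counter starts at 0
    wm.manage(weights);             // second user: counter -> 1
    wm.pre_mark_as_unused(weights); // mark once every extra user has released
    wm.release(weights);            // counter -> 0 and is_unused is set, so
                                    //   weights->mark_as_unused() fires here
}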
- */ -#include "arm_compute/runtime/Lut.h" - -#include <cstring> - -using namespace arm_compute; - -Lut::Lut() - : _allocator() -{ -} - -Lut::Lut(size_t num_elements, DataType data_type) - : _allocator() -{ - _allocator.init(num_elements, data_type); -} - -size_t Lut::num_elements() const -{ - return _allocator.num_elements(); -} - -uint32_t Lut::index_offset() const -{ - return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0; -} - -size_t Lut::size_in_bytes() const -{ - return _allocator.size(); -} - -DataType Lut::type() const -{ - return _allocator.type(); -} - -uint8_t *Lut::buffer() const -{ - return _allocator.data(); -} - -void Lut::clear() -{ - ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr); - std::memset(this->buffer(), 0, this->size_in_bytes()); -} - -ILutAllocator *Lut::allocator() -{ - return &_allocator; -} diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp deleted file mode 100644 index 4b77a4533f..0000000000 --- a/src/runtime/LutAllocator.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/LutAllocator.h" - -using namespace arm_compute; - -LutAllocator::LutAllocator() - : _buffer() -{ -} - -uint8_t *LutAllocator::data() const -{ - return _buffer.data(); -} - -void LutAllocator::allocate() -{ - _buffer.resize(size()); -} - -uint8_t *LutAllocator::lock() -{ - return _buffer.data(); -} - -void LutAllocator::unlock() -{ -} diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp deleted file mode 100644 index 054169ac46..0000000000 --- a/src/runtime/MEMUtils.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CPP/CPPTypes.h" -#include "arm_compute/core/Error.h" -#include "support/StringSupport.h" - -#ifndef BARE_METAL -#include <fstream> -#include <iterator> -#include <sstream> -#endif // ifndef BARE_METAL - -namespace -{ -void parse_mem_info(size_t &total, size_t &free, size_t &buffer) -{ - free = 0; - total = 0; - buffer = 0; -#ifndef BARE_METAL - size_t memcache = 0; - size_t memfree = 0; - std::ifstream meminfo_f; - meminfo_f.open("/proc/meminfo", std::ios::in); - - if(meminfo_f.is_open()) - { - std::string line; - while(bool(getline(meminfo_f, line))) - { - std::istringstream iss(line); - std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)), - std::istream_iterator<std::string>()); - if(tokens[0] == "MemTotal:") - { - total = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "MemFree:") - { - memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "Buffers:") - { - buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "Cached:") - { - memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - } - free = memfree + (buffer + memcache); - } -#endif // ifndef BARE_METAL -} - -} // namespace - -namespace arm_compute -{ -void MEMInfo::set_policy(MemoryPolicy policy) -{ - _policy = policy; -} - -MemoryPolicy MEMInfo::get_policy() -{ - return _policy; -} -MemoryPolicy MEMInfo::_policy = { MemoryPolicy::NORMAL }; - -MEMInfo::MEMInfo() - : _total(0), _free(0), _buffer(0) -{ - parse_mem_info(_total, _free, _buffer); -} - -size_t MEMInfo::get_total_in_kb() const -{ - return _total; -} - -} // namespace arm_compute diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index c6b956d929..90fd025eb7 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index d9803a8caf..5fa9ea47e9 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2018 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager) +MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, + std::shared_ptr<IPoolManager> pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp deleted file mode 100644 index 1584e3ab8e..0000000000 --- a/src/runtime/MultiHOG.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/MultiHOG.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IMultiHOG.h" - -using namespace arm_compute; - -MultiHOG::MultiHOG(size_t num_models) - : _num_models(num_models), _model() -{ - _model.resize(_num_models); -} - -size_t MultiHOG::num_models() const -{ - return _num_models; -} - -IHOG *MultiHOG::model(size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= _num_models); - return (&_model[index]); -} - -const IHOG *MultiHOG::model(size_t index) const -{ - ARM_COMPUTE_ERROR_ON(index >= _num_models); - return (&_model[index]); -} diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp deleted file mode 100644 index eec58d3ef4..0000000000 --- a/src/runtime/MultiImage.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
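MemoryManagerOnDemand keeps the same wiring as before: a lifetime manager paired with a pool manager, shared across functions, then backed with pools before the first run. A setup sketch assuming the usual BlobLifetimeManager/PoolManager/Allocator trio (the populate() signature matches the diff above):

#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

void setup_memory_manager()
{
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with `mm` share its pools; one pool suffices
    // when they run sequentially.
    Allocator allocator;
    mm->populate(allocator, 1);
}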
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/MultiImage.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -MultiImage::MultiImage() - : _info(), _plane() -{ -} - -const MultiImageInfo *MultiImage::info() const -{ - return &_info; -} - -void MultiImage::init(unsigned int width, unsigned int height, Format format) -{ - internal_init(width, height, format, false); -} - -void MultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format) -{ - internal_init(width, height, format, true); -} - -void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding) -{ - TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format); - TensorInfo info(shape, Format::U8); - - if(auto_padding) - { - info.auto_padding(); - } - - switch(format) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F16: - case Format::F32: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - { - TensorInfo info_full(shape, format); - - if(auto_padding) - { - info_full.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info_full); - break; - } - case Format::NV12: - case Format::NV21: - { - const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88); - TensorInfo info_uv88(shape_uv88, Format::UV88); - - if(auto_padding) - { - info_uv88.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info_uv88); - break; - } - case Format::IYUV: - { - const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV); - TensorInfo info_sub2(shape_sub2, Format::U8); - - if(auto_padding) - { - info_sub2.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info_sub2); - std::get<2>(_plane).allocator()->init(info_sub2); - break; - } - case Format::YUV444: - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info); - std::get<2>(_plane).allocator()->init(info); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - - _info.init(shape.x(), shape.y(), format); -} - -void MultiImage::allocate() -{ - 
switch(_info.format()) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F16: - case Format::F32: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - std::get<0>(_plane).allocator()->allocate(); - break; - case Format::NV12: - case Format::NV21: - std::get<0>(_plane).allocator()->allocate(); - std::get<1>(_plane).allocator()->allocate(); - break; - case Format::IYUV: - case Format::YUV444: - std::get<0>(_plane).allocator()->allocate(); - std::get<1>(_plane).allocator()->allocate(); - std::get<2>(_plane).allocator()->allocate(); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } -} - -void MultiImage::create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height) -{ - arm_compute::Format format = image->info()->format(); - TensorInfo info(width, height, Format::U8); - - switch(format) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F32: - case Format::F16: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - { - TensorInfo info_full(width, height, format); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info_full); - break; - } - case Format::NV12: - case Format::NV21: - { - TensorInfo info_uv88(width / 2, height / 2, Format::UV88); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_uv88); - break; - } - case Format::IYUV: - { - TensorInfo info_sub2(width / 2, height / 2, Format::U8); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_sub2); - std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info_sub2); - break; - } - case Format::YUV444: - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - - _info.init(width, height, format); -} - -Image *MultiImage::plane(unsigned int index) -{ - return &_plane[index]; -} - -const Image *MultiImage::plane(unsigned int index) const -{ - return &_plane[index]; -} diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/INEOperator.cpp index 9be96af66a..fcfd3251ff 100644 --- a/src/runtime/NEON/functions/NEUpsampleLayer.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,32 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h" +#include "arm_compute/runtime/NEON/INEOperator.h" -#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/NEON/INEKernel.h" namespace arm_compute { -NEUpsampleLayer::NEUpsampleLayer() - : _kernel(), _data_layout() +namespace experimental +{ +INEOperator::~INEOperator() = default; + +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } -Status NEUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, - const InterpolationPolicy &policy) +void INEOperator::run(ITensorPack &tensors) +{ + if (tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + run(tensors, _kernel->window()); +} + +void INEOperator::run(ITensorPack &tensors, const Window &window) { - return NEUpsampleLayerKernel::validate(input, output, info, policy); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, window, tensors); } -void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy) +void INEOperator::prepare(ITensorPack &constants) { - _data_layout = input->info()->data_layout(); - _kernel.configure(input, output, info, policy); + ARM_COMPUTE_UNUSED(constants); } -void NEUpsampleLayer::run() +MemoryRequirements INEOperator::workspace() const { - const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX; - NEScheduler::get().schedule(&_kernel, win); + return _workspace; } +} // namespace experimental } // namespace arm_compute diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 23d9872294..b6977221b9 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/core/NEON/kernels/NEFillBorderKernel.h" + +namespace arm_compute +{ +INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } void INESimpleFunction::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); NEScheduler::get().schedule(_kernel.get(), Window::DimY); } +} //namespace arm_compute diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 2cabee4c46..04bff9fa4b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,19 +23,22 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/Utils.h" + +#include "src/core/NEON/INEKernel.h" +#include "src/runtime/Utils.h" namespace arm_compute { -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; + +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } void INESimpleFunctionNoBorder::run() { - schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); + utils::schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp deleted file mode 100644 index 06b38a9b9e..0000000000 --- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" - -#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAbsoluteDifferenceKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp deleted file mode 100644 index 47ea83de93..0000000000 --- a/src/runtime/NEON/functions/NEAccumulate.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEAccumulate.h" - -#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEAccumulate::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16) -{ - if(use_fp16) - { - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>(); - k->configure(input, alpha, output); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedKernel>(); - k->configure(input, alpha, output); - _kernel = std::move(k); - } -} - -void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>(); - k->configure(input, shift, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e4d1125c79..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,50 @@ */ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/runtime/IRuntimeContext.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuActivation.h" namespace arm_compute { -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) // NOLINT - : INESimpleFunctionNoBorder(ctx) +struct NEActivationLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr<cpu::CpuActivation> op{nullptr}; +}; + +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { + _impl->ctx = ctx; } +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; +NEActivationLayer::~NEActivationLayer() = default; + void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { - auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>(); - k->configure(input, output, activation_info); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output == nullptr ? input : output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + + _impl->op = std::make_unique<cpu::CpuActivation>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); +} + +Status +NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +{ + return cpu::CpuActivation::validate(input, output, act_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void NEActivationLayer::run() { - return NEActivationLayerKernel::validate(input, output, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp new file mode 100644 index 0000000000..a72364791c --- /dev/null +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
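The user-facing NEActivationLayer contract is unchanged by the move to cpu::CpuActivation; only the internals now route through the pimpl and an ITensorPack. A standard configure/run sketch of typical usage:

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void relu_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    NEActivationLayer act;
    act.configure(&src, &dst,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();
}

Per the configure() shown above, passing a null output keeps the operation in place on the input tensor.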
+ */ + +#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" + +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuAddMulAdd.h" + +namespace arm_compute +{ +struct NEAddMulAdd::Impl +{ + std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; +}; + +NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +NEAddMulAdd::~NEAddMulAdd() = default; + +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + + _impl->op = std::make_unique<cpu::CpuAddMulAdd>(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, + }; + + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); +} + +void NEAddMulAdd::run() +{ + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index a23061e296..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
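Reading the new NEAddMulAdd interface, the fused computation appears to be an element-wise addition feeding a batch-norm style multiply-add, with an optional intermediate output. Both the formula and the per-channel broadcast of bn_mul/bn_add are inferred from the parameter names here, not quoted from documentation:

#include <cstddef>

// final_output = act((input1 + input2) * bn_mul + bn_add)
// add_output   = input1 + input2   (only written if requested)
void add_mul_add_ref(const float *in1, const float *in2,
                     const float *bn_mul, const float *bn_add,
                     float *add_out, float *final_out,
                     std::size_t n, std::size_t channels,
                     float (*act)(float))
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float sum = in1[i] + in2[i];
        if (add_out != nullptr)
        {
            add_out[i] = sum; // optional intermediate, as in the interface
        }
        final_out[i] = act(sum * bn_mul[i % channels] + bn_add[i % channels]);
    }
}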
* * SPDX-License-Identifier: MIT * @@ -29,28 +29,68 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NECast.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _reduction_function(support::cpp14::make_unique<NEReductionOperation>()) +struct NEArgMinMaxLayer::Impl { - ARM_COMPUTE_UNUSED(memory_manager); + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<NEReductionOperation> reduction_function{}; + std::unique_ptr<NECast> cast_function{}; + std::unique_ptr<Tensor> tmp_reduction_result{}; +}; + +NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; + +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_manager = std::move(memory_manager); } + void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) { - _reduction_function->configure(input, output, axis, op, false); + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); + _impl->reduction_function = std::make_unique<NEReductionOperation>(); + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->cast_function = std::make_unique<NECast>(); + _impl->tmp_reduction_result = std::make_unique<Tensor>(); + _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false); + _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE); + _impl->memory_group.manage(_impl->tmp_reduction_result.get()); + _impl->tmp_reduction_result->allocator()->allocate(); + } + else + { + _impl->reduction_function->configure(input, output, axis, op, false); + } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } void NEArgMinMaxLayer::run() { - _reduction_function->run(); + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->reduction_function->run(); + if (_impl->tmp_reduction_result != nullptr) + { + _impl->cast_function->run(); + } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 06c71db1bd..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,24 +23,57 @@ */ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuAdd.h" #include <utility> namespace arm_compute { -void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +struct NEArithmeticAddition::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuAdd> op{nullptr}; +}; + +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>()) +{ +} +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; +NEArithmeticAddition::~NEArithmeticAddition() = default; + +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuAdd>(); + _impl->op->configure(_impl->src_0->info(), _impl->src_1->info(), _impl->dst->info(), policy, act_info); } -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) + +void NEArithmeticAddition::run() { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEArithmeticAdditionKernel::validate(input1, input2, output, policy); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 20f930a286..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
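// NEArithmeticAddition above now forwards to the cpu::CpuAdd operator through
// an ITensorPack built in run(), but the public configure()/run() contract is
// unchanged. A minimal sketch, assuming two F32 tensors of matching shape:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_sketch()
{
    Tensor a, b, sum;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    sum.allocator()->init(info);

    NEArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE, ActivationLayerInfo());

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    // ... fill a and b ...
    add.run(); // packs ACL_SRC_0/ACL_SRC_1/ACL_DST and runs CpuAdd
}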
* * SPDX-License-Identifier: MIT * @@ -24,24 +24,56 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuSub.h" #include <utility> namespace arm_compute { -void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +struct NEArithmeticSubtraction::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSub> op{nullptr}; +}; + +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>()) +{ +} +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; + +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuSub::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuSub>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info); } -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::run() { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index bb224db163..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
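// Note that the old NEArithmeticSubtraction::validate() rejected any enabled
// activation, while the rewritten version forwards act_info to cpu::CpuSub,
// so fused activations can now be validated up front. A sketch of the
// validate-before-configure idiom (the helper name is illustrative):
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"

using namespace arm_compute;

bool can_subtract_with_relu(const TensorShape &shape)
{
    const TensorInfo a(shape, 1, DataType::F32);
    const TensorInfo b(shape, 1, DataType::F32);
    const TensorInfo dst(shape, 1, DataType::F32);
    const Status st = NEArithmeticSubtraction::validate(
        &a, &b, &dst, ConvertPolicy::SATURATE,
        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    return static_cast<bool>(st); // Status converts to true when validation passes
}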
* * SPDX-License-Identifier: MIT * @@ -30,28 +30,48 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +namespace arm_compute +{ +NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; + +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); // Configure kernel - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel = std::make_unique<NEBatchNormalizationLayerKernel>(); + _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } void NEBatchNormalizationLayer::run() { - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index a4db1fdda3..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
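// The batch-normalization kernel above is now heap-allocated and scheduled
// through its unique_ptr, while the user-facing API is unchanged. A minimal
// sketch, assuming a default-layout (NCHW) F32 input; shapes are illustrative
// and the per-channel statistics vectors match the channel dimension:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void batch_norm_sketch()
{
    Tensor src, dst, mean, var, beta, gamma;
    const TensorShape data_shape(14U, 14U, 16U, 1U); // W, H, C, N
    src.allocator()->init(TensorInfo(data_shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(data_shape, 1, DataType::F32));
    for (Tensor *t : {&mean, &var, &beta, &gamma})
    {
        t->allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    }

    NEBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 1e-5f, ActivationLayerInfo());

    for (Tensor *t : {&src, &dst, &mean, &var, &beta, &gamma})
    {
        t->allocator()->allocate();
    }
    bn.run();
}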
* * SPDX-License-Identifier: MIT * @@ -29,29 +29,39 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" + namespace arm_compute { void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); k->configure(input, block_shape, output); _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { - auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>(); - k->configure(input, block_shape_x, block_shape_y, output); + auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); + k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { - return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 98f4745179..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseAndKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseAndKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 173b7d5f74..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
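// The scalar-block-shape overload of NEBatchToSpaceLayer above gains a
// CropInfo parameter. A minimal sketch, assuming NCHW F32 data and a
// default-constructed CropInfo (no cropping); with a 2x2 block the four
// input batches fold back into a single 4x4 plane:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void batch_to_space_sketch()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32)); // W, H, C, N
    dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));

    NEBatchToSpaceLayer b2s;
    b2s.configure(&src, 2, 2, &dst, CropInfo{});

    src.allocator()->allocate();
    dst.allocator()->allocate();
    b2s.run();
}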
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseNot::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseNotKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output); + auto k = std::make_unique<NEBitwiseNotKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 64f1d82350..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseOrKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseOrKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index 28c1036112..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseXorKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseXorKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index 6f767c7d31..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
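// The bitwise functions above remain thin simple-function wrappers; only the
// kernel header location and the switch to std::make_unique changed. A
// minimal sketch for NEBitwiseAnd, assuming U8 tensors (the type these
// kernels operate on):
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void bitwise_and_sketch()
{
    Tensor a, b, dst;
    const TensorInfo info(TensorShape(64U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    dst.allocator()->init(info);

    NEBitwiseAnd band;
    band.configure(&a, &b, &dst);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    band.run();
}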
* * SPDX-License-Identifier: MIT * @@ -23,19 +23,27 @@ */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel - auto k = arm_compute::support::cpp14::make_unique<NEBoundingBoxTransformKernel>(); + auto k = std::make_unique<NEBoundingBoxTransformKernel>(); k->configure(boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp deleted file mode 100644 index 096b226ab5..0000000000 --- a/src/runtime/NEON/functions/NEBox3x3.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
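// A minimal sketch for the refactored NEBoundingBoxTransform above. The box
// count, class count and the BoundingBoxTransformInfo arguments (image width,
// image height, scale) are illustrative assumptions; boxes are stored as
// (x1, y1, x2, y2) rows, and deltas/predictions carry one box per class:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void bbox_transform_sketch()
{
    constexpr unsigned int num_boxes   = 128;
    constexpr unsigned int num_classes = 4;
    Tensor boxes, deltas, pred_boxes;
    boxes.allocator()->init(TensorInfo(TensorShape(4U, num_boxes), 1, DataType::F32));
    deltas.allocator()->init(TensorInfo(TensorShape(4U * num_classes, num_boxes), 1, DataType::F32));
    pred_boxes.allocator()->init(TensorInfo(TensorShape(4U * num_classes, num_boxes), 1, DataType::F32));

    NEBoundingBoxTransform transform;
    transform.configure(&boxes, &pred_boxes, &deltas, BoundingBoxTransformInfo(640.f, 480.f, 1.f));

    for (Tensor *t : {&boxes, &deltas, &pred_boxes})
    {
        t->allocator()->allocate();
    }
    transform.run();
}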
- */ -#include "arm_compute/runtime/NEON/functions/NEBox3x3.h" - -#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - if(use_fp16) - { - auto k = arm_compute::support::cpp14::make_unique<NEBox3x3FP16Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEBox3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - } - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp deleted file mode 100644 index a57ea606a9..0000000000 --- a/src/runtime/NEON/functions/NECannyEdge.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NECannyEdge.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <cstring> -#include <inttypes.h> -#include <utility> - -using namespace arm_compute; - -NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _gradient(), - _non_max_suppr(), - _edge_trace(), - _border_mag_gradient(), - _border_edge_trace(), - _gx(), - _gy(), - _magnitude(), - _phase(), - _nonmax(), - _output(nullptr) -{ -} - -void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); - ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); - ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr)); - - _output = output; - - const TensorShape &shape = input->info()->tensor_shape(); - TensorInfo gradient_info; - TensorInfo magnitude_info; - - // Initialize images - if(gradient_size < 7) - { - gradient_info.init(shape, Format::S16); - magnitude_info.init(shape, Format::U16); - } - else - { - gradient_info.init(shape, Format::S32); - magnitude_info.init(shape, Format::U32); - } - - _gx.allocator()->init(gradient_info); - _gy.allocator()->init(gradient_info); - _magnitude.allocator()->init(magnitude_info); - - TensorInfo info(shape, Format::U8); - _phase.allocator()->init(info); - _nonmax.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Configure/Init sobelNxN - if(gradient_size == 3) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 5) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 7) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else - { - ARM_COMPUTE_ERROR_VAR("Gradient size %+" PRId32 " not supported\n", gradient_size); - } - - // Manage intermediate buffers - _memory_group.manage(&_magnitude); - _memory_group.manage(&_phase); - - // Configure gradient - auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>(); - k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type); - _gradient = std::move(k); - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); 
- - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Configure non-maxima suppression - _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); - - // Fill border around magnitude image as non-maxima suppression will access - // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value); - - // Allocate intermediate tensors - _phase.allocator()->allocate(); - _magnitude.allocator()->allocate(); - - // Configure edge tracing - _edge_trace.configure(&_nonmax, output); - - // Fill border with "No edge" to stop recursion in edge trace - _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f)); - - // Allocate intermediate tensors - _nonmax.allocator()->allocate(); -} - -void NECannyEdge::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run sobelNxN - _sobel->run(); - - // Run gradient - NEScheduler::get().schedule(_gradient.get(), Window::DimY); - - // Fill border before non-maxima suppression. Nop for border mode undefined. - NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ); - - // Run non-maxima suppression - NEScheduler::get().schedule(&_non_max_suppr, Window::DimY); - - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - std::fill_n(_output->buffer(), _output->info()->total_size(), 0); - - // Fill border before edge trace - NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ); - - // Run edge tracing - NEScheduler::get().schedule(&_edge_trace, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index 464a608c8c..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,24 +23,46 @@ */ #include "arm_compute/runtime/NEON/functions/NECast.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuCast.h" namespace arm_compute { +struct NECast::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; +}; + +NECast::NECast() : _impl(std::make_unique<Impl>()) +{ +} +NECast::NECast(NECast &&) = default; +NECast &NECast::operator=(NECast &&) = default; +NECast::~NECast() = default; + void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>(); - k->configure(input, output, policy, 0); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_LOG_PARAMS(input, output, policy); + _impl->op = std::make_unique<cpu::CpuCast>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy) +{ + return cpu::CpuCast::validate(input, output, policy); } -Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy) +void NECast::run() { - return NEDepthConvertLayerKernel::validate(input, output, policy, 0); + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp deleted file mode 100644 index 37e92c28fd..0000000000 --- a/src/runtime/NEON/functions/NEChannelCombine.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
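// NECast above now owns a cpu::CpuCast operator behind an Impl pointer and
// builds its ITensorPack in run(). A minimal sketch, assuming a U8 -> F32
// conversion, one of the pairs the cast operator supports:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void cast_sketch()
{
    Tensor u8_src, f32_dst;
    u8_src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::U8));
    f32_dst.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

    NECast cast;
    cast.configure(&u8_src, &f32_dst, ConvertPolicy::SATURATE);

    u8_src.allocator()->allocate();
    f32_dst.allocator()->allocate();
    cast.run();
}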
- */ -#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" - -#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>(); - k->configure(plane0, plane1, plane2, plane3, output); - _kernel = std::move(k); -} - -void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>(); - k->configure(plane0, plane1, plane2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp deleted file mode 100644 index 37a9892d07..0000000000 --- a/src/runtime/NEON/functions/NEChannelExtract.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" - -#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>(); - k->configure(input, channel, output); - _kernel = std::move(k); -} - -void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>(); - k->configure(input, channel, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 46d77831c4..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,15 +23,17 @@ */ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" -#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" namespace arm_compute { void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups) { - auto k = arm_compute::support::cpp14::make_unique<NEChannelShuffleLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); + auto k = std::make_unique<NEChannelShuffleLayerKernel>(); k->configure(input, output, num_groups); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp deleted file mode 100644 index fff7633833..0000000000 --- a/src/runtime/NEON/functions/NEColorConvert.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
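// A minimal sketch for the channel-shuffle wrapper above; num_groups must
// divide the channel count, so the 8-channel, 2-group split below is an
// illustrative assumption:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void channel_shuffle_sketch()
{
    Tensor src, dst;
    const TensorInfo info(TensorShape(7U, 7U, 8U, 1U), 1, DataType::F32); // W, H, C, N
    src.allocator()->init(info);
    dst.allocator()->init(info);

    NEChannelShuffleLayer shuffle;
    shuffle.configure(&src, &dst, 2);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    shuffle.run();
}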
- */ -#include "arm_compute/runtime/NEON/functions/NEColorConvert.h" - -#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEColorConvert::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IMultiImage *input, IImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IImage *input, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index 61d41b44f8..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,162 +23,69 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { -NEConcatenateLayer::NEConcatenateLayer() - : _concat_kernels(), - _num_inputs(0), - _axis(Window::DimX) +struct NEConcatenateLayer::Impl { -} - -void NEConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis) + std::vector<const ITensor *> srcs{}; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr<cpu::CpuConcatenate> op{nullptr}; +}; + +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>()) { - configure_internal(std::move(inputs_vector), output, axis); } +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; +NEConcatenateLayer::~NEConcatenateLayer() = default; void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis) { - configure_internal(std::move(inputs_vector), output, axis); -} - -Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); 
-} - -Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -template <typename TensorType, typename> -void NEConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis) -{ ARM_COMPUTE_ERROR_ON(output == nullptr); - _axis = axis; - _num_inputs = inputs_vector.size(); - std::vector<ITensorInfo *> inputs_vector_info; - inputs_vector_info.reserve(_num_inputs); - for(unsigned int i = 0; i < _num_inputs; ++i) + _impl->srcs = inputs_vector; + _impl->dst = output; + _impl->axis = axis; + _impl->num_inputs = inputs_vector.size(); + _impl->op = std::make_unique<cpu::CpuConcatenate>(); + + std::vector<const ITensorInfo *> inputs_vector_info; + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); } - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis)); - - unsigned int offset = 0; - - for(unsigned int i = 0; i < _num_inputs; ++i) - { - switch(_axis) - { - case Window::DimX: - { - auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimY: - { - auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimZ: - { - auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 3: - { - auto kernel = support::cpp14::make_unique<NEBatchConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += inputs_vector.at(i)->info()->dimension(_axis); - } + _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -template <typename TensorInfoType, typename> -Status NEConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); - - unsigned int offset = 0; - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - switch(axis) - { - case Window::DimX: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case Window::DimY: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case Window::DimZ: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case 3: - { - 
ARM_COMPUTE_RETURN_ON_ERROR(NEBatchConcatenateLayerKernel::validate(input, offset, output)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += input->dimension(axis); - } - - if(output->total_size() != 0) - { - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); - } - - return Status{}; + return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } void NEConcatenateLayer::run() { - for(auto &kernel : _concat_kernels) + ITensorPack pack; + for (unsigned i = 0; i < _impl->num_inputs; ++i) { - NEScheduler::get().schedule(kernel.get(), Window::DimY); + pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp new file mode 100644 index 0000000000..8f41151d6c --- /dev/null +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEConv3D.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDirectConv3d.h" + +namespace arm_compute +{ +using namespace arm_compute::experimental; + +struct NEConv3D::Impl +{ + std::unique_ptr<cpu::ICpuOperator> op{nullptr}; + ITensorPack run_pack{}; +}; + +NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>()) +{ +} + +NEConv3D::~NEConv3D() = default; + +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); + + auto f = std::make_unique<cpu::CpuDirectConv3d>(); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), + conv_info); + _impl->op = std::move(f); + + if (_impl->op != nullptr) + { + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + } +} + +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); + + return Status{}; +} + +void NEConv3D::run() +{ + if (_impl->op != nullptr) + { + _impl->op->run(_impl->run_pack); + } +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index f65c035da6..84e8565aaf 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,27 +23,49 @@ */ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" + namespace arm_compute { -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _kernel() +struct NEConvertFullyConnectedWeights::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr}; +}; +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) { } +NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - _kernel.configure(input, output, original_input_shape, data_layout); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuConvertFullyConnectedWeights>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout); + return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } void NEConvertFullyConnectedWeights::run() { - NEScheduler::get().schedule(&_kernel, Window::DimZ); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp deleted file mode 100644 index 255cb3d704..0000000000 --- a/src/runtime/NEON/functions/NEConvolution.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEConvolution.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <array> -#include <utility> - -using namespace arm_compute; - -void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>(); - k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} - -template <unsigned int matrix_size> -NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() -{ -} - -template <unsigned int matrix_size> -void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(conv == nullptr); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - - std::array<int16_t, matrix_size> conv_col{ { 0 } }; - std::array<int16_t, matrix_size> conv_row{ { 0 } }; - - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - if(_is_separable) - { - DataType intermediate_type = DataType::UNKNOWN; - std::tie(std::ignore, intermediate_type) = 
data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Calculate scale - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); - - _tmp.allocator()->allocate(); - - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - } - else - { - _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); - } -} - -template <unsigned int matrix_size> -void NEConvolutionSquare<matrix_size>::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_kernel_hor, Window::DimY); - NEScheduler::get().schedule(&_kernel_vert, Window::DimY); - } - else - { - NEScheduler::get().schedule(&_kernel, Window::DimY); - } -} - -template class arm_compute::NEConvolutionSquare<5>; -template class arm_compute::NEConvolutionSquare<7>; -template class arm_compute::NEConvolutionSquare<9>; - -void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>(); - k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 4a779917a7..8efebbbb1a 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
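// A minimal sketch for the new NEConv3D function added above, assuming NDHWC
// F32 tensors and a default-constructed Conv3dInfo (unit strides, no padding,
// no dilation); the shapes, including the [OFM, IFM, W, H, D] weight layout,
// are illustrative assumptions:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void conv3d_sketch()
{
    Tensor src, weights, biases, dst;
    TensorInfo src_info(TensorShape(4U, 8U, 8U, 8U, 1U), 1, DataType::F32); // C, W, H, D, N
    TensorInfo wei_info(TensorShape(8U, 4U, 3U, 3U, 3U), 1, DataType::F32); // OFM, IFM, W, H, D
    TensorInfo dst_info(TensorShape(8U, 6U, 6U, 6U, 1U), 1, DataType::F32); // 8 - 3 + 1 = 6 per spatial dim
    for (TensorInfo *ti : {&src_info, &wei_info, &dst_info})
    {
        ti->set_data_layout(DataLayout::NDHWC);
    }
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(dst_info);

    NEConv3D conv;
    conv.configure(&src, &weights, &biases, &dst, Conv3dInfo{});
    for (Tensor *t : {&src, &weights, &biases, &dst})
    {
        t->allocator()->allocate();
    }
    conv.run();
}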
* * SPDX-License-Identifier: MIT * @@ -25,174 +25,184 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" -#include <cmath> -#include <tuple> -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuConv2d.h" +#include "src/cpu/operators/CpuDirectConv2d.h" +#include "src/cpu/operators/CpuGemmConv2d.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT - : _memory_manager(std::move(memory_manager)), - _function() +using namespace arm_compute::experimental; + +struct NEConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<cpu::ICpuOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +NEConvolutionLayer::~NEConvolutionLayer() = default; + +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math)); - - switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - { - auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); - _function = std::move(f); - break; - } case ConvolutionMethod::GEMM: - { - auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); - _function = std::move(f); - break; - } + case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + auto f = std::make_unique<cpu::CpuConv2d>(); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op = std::move(f); break; } case ConvolutionMethod::FFT: { - auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager); + auto f = std::make_unique<NEFFTConvolutionLayer>(_impl->memory_manager); f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + _impl->func = std::move(f); break; } default: ARM_COMPUTE_ERROR("Not supported."); break; } + + if (_impl->op) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON"); + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); + + // Biases with dynamic values are not supported with quantized inputs. 
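Editor's note: the rewritten NEConvolutionLayer above keeps its state in a private Impl and drives a stateless cpu operator through tensor packs (run_pack for execution, prep_pack for one-off weight preparation). A minimal standalone sketch of that function-wraps-operator pattern; all types here are hypothetical stand-ins, not ACL's own:

#include <map>
#include <memory>

// Hypothetical stand-ins for ITensor / ITensorPack / ICpuOperator.
struct TensorStub { /* buffer, shape, ... */ };
using TensorPack = std::map<int, TensorStub *>; // slot id -> tensor

struct OperatorStub
{
    // Stateless: every run() reads its inputs and outputs from the pack,
    // so one configured operator can serve many callers concurrently.
    void run(TensorPack &pack) { (void)pack; /* kernel dispatch here */ }
};

class FunctionStub
{
public:
    void configure(TensorStub *src, TensorStub *dst)
    {
        _pack = {{/*ACL_SRC*/ 0, src}, {/*ACL_DST*/ 1, dst}};
        _op   = std::make_unique<OperatorStub>();
    }
    void run() { _op->run(_pack); } // the function owns the state, not the operator

private:
    TensorPack _pack{};
    std::unique_ptr<OperatorStub> _op{};
};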
+ if (biases) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())), + "Dynamic Biases are not supported with quantized input data."); + } - switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - //Validate Winograd - ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); - break; case ConvolutionMethod::GEMM: - //Validate Gemm-based Convolution - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info)); - break; + case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - //Validate Direct Convolution - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); break; } - return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); - ARM_COMPUTE_UNUSED(weights_info); - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - /* Input spatial dims, kernel size, IFM/OFM, conv info*/ - using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; - using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - - const std::vector<ConfigurationMethod> known_configs = - { - // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), - // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), 
ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; - - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); +} - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); - }; +void NEConvolutionLayer::run() +{ + prepare(); - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(dilation != Size2D(1U, 1U)) + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->run(); } else { - // SRGAN - // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (NEDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + _impl->op->run(_impl->run_pack); } } -void NEConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - void NEConvolutionLayer::prepare() { - _function->prepare(); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index 55c4faf9ab..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
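Editor's note: the heuristic deleted above first matched the layer against a table of known configurations (AlexNet, VGG16/19, MobileNet) before applying size-based rules; that logic now lives behind cpu::CpuConv2d::get_convolution_method. A compact sketch of the lookup-then-fallback style, with hypothetical types and placeholder entries:

#include <algorithm>
#include <utility>
#include <vector>

enum class Method { GEMM, DIRECT, WINOGRAD };

struct ShapeKey
{
    int in_w, in_h, k_w, k_h;
    bool operator==(const ShapeKey &o) const
    {
        return in_w == o.in_w && in_h == o.in_h && k_w == o.k_w && k_h == o.k_h;
    }
};

Method pick_method(const ShapeKey &query)
{
    // Known-good (configuration -> method) pairs, consulted before any
    // generic size-based rule.
    static const std::vector<std::pair<ShapeKey, Method>> known = {
        {{224, 224, 3, 3}, Method::GEMM}, // a VGG-style first layer
        {{27, 27, 5, 5}, Method::GEMM},   // an AlexNet-style layer
    };
    const auto it = std::find_if(known.begin(), known.end(),
                                 [&](const std::pair<ShapeKey, Method> &e) { return e.first == query; });
    return it != known.end() ? it->second : Method::WINOGRAD; // else: generic rules
}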
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,51 @@ */ #include "arm_compute/runtime/NEON/functions/NECopy.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuCopy.h" #include <utility> namespace arm_compute { +struct NECopy::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCopy> op{nullptr}; +}; + +NECopy::NECopy() : _impl(std::make_unique<Impl>()) +{ +} +NECopy::NECopy(NECopy &&) = default; +NECopy &NECopy::operator=(NECopy &&) = default; +NECopy::~NECopy() = default; + void NECopy::configure(ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NECopyKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuCopy>(); + _impl->op->configure(input->info(), output->info()); +} + +Status NECopy::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuCopy::validate(input, output)); + + return Status{}; } -Status NECopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +void NECopy::run() { - return NECopyKernel::validate(input, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index cc39d0284e..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,47 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
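Editor's note: NECopy above declares its destructor and move operations in the header but defaults them in the .cpp; with std::unique_ptr<Impl> this ordering matters, because the destructor must be instantiated in a translation unit where Impl is complete. A self-contained sketch of the idiom, using a hypothetical Widget class rather than ACL code:

#include <memory>

// Header part: Impl is only forward-declared, so the compiler must not
// try to generate ~Widget() here.
class Widget
{
public:
    Widget();
    ~Widget();                   // defaulted in the .cpp
    Widget(Widget &&) noexcept;  // ditto for the move operations
    Widget &operator=(Widget &&) noexcept;

private:
    struct Impl;                 // incomplete type
    std::unique_ptr<Impl> _impl;
};

// .cpp part: Impl is complete from here on.
struct Widget::Impl
{
    int state{0};
};

Widget::Widget() : _impl(std::make_unique<Impl>()) {}
Widget::~Widget() = default; // unique_ptr's deleter now sees a complete Impl
Widget::Widget(Widget &&) noexcept = default;
Widget &Widget::operator=(Widget &&) noexcept = default;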
*/ +#include "arm_compute/runtime/NEON/functions/NECropResize.h" + #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/NEON/functions/NECropResize.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NECropKernel.h" #include <cstddef> namespace arm_compute { +NECropResize::~NECropResize() = default; + NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -51,11 +71,18 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); + ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y); @@ -75,20 +102,20 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { - auto crop_tensor = support::cpp14::make_unique<Tensor>(); + auto crop_tensor = std::make_unique<Tensor>(); TensorInfo crop_result_info(1, DataType::F32); crop_result_info.set_data_layout(DataLayout::NHWC); crop_tensor->allocator()->init(crop_result_info); - auto scale_tensor = 
support::cpp14::make_unique<Tensor>(); + auto scale_tensor = std::make_unique<Tensor>(); TensorInfo scaled_result_info(out_shape, 1, DataType::F32); scaled_result_info.set_data_layout(DataLayout::NHWC); scale_tensor->allocator()->init(scaled_result_info); - auto crop_kernel = support::cpp14::make_unique<NECropKernel>(); - auto scale_kernel = support::cpp14::make_unique<NEScale>(); + auto crop_kernel = std::make_unique<NECropKernel>(); + auto scale_kernel = std::make_unique<NEScale>(); crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value); _crop.emplace_back(std::move(crop_kernel)); @@ -102,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until run-time and so the kernels cannot be configured until then. @@ -111,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute
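Editor's note: NECropResize is unusual in that it configures and allocates per-box intermediates inside run(), since each crop rectangle lives in the boxes tensor and the intermediate shapes are only known once that tensor holds data. A sketch of the deferred-configure loop under that assumption, with hypothetical types:

#include <cstddef>
#include <vector>

struct Box { int w{0}, h{0}; };

struct ResizeOp
{
    void configure(const Box &b) { w = b.w; h = b.h; } // shape-dependent setup
    void run() { /* resample the w x h region */ }
    int w{0}, h{0};
};

void run_crop_resize(const std::vector<Box> &boxes, std::vector<ResizeOp> &ops)
{
    for (std::size_t i = 0; i < boxes.size(); ++i)
    {
        ops[i].configure(boxes[i]); // deferred: only now is the shape known
        ops[i].run();
    }
}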
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index dd53fbbdc3..081c7cc538 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" + using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -59,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } - } // namespace NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT @@ -75,27 +78,54 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor _original_weights(nullptr), _input(nullptr), _info(), - _is_prepared(false) + _is_prepared(false), + _do_upsampling(true) { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + { + 
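Editor's note: the branch opened here admits QSYMM8_PER_CHANNEL weights alongside quantized inputs, where every output channel carries its own effective requantization scale. A short sketch of how those per-channel factors are conventionally derived; this is the standard formula, not code from this diff:

#include <vector>

// effective_scale[c] = input_scale * weight_scale[c] / output_scale:
// one rescale factor per output channel of the deconvolution.
std::vector<float> effective_scales(float input_scale,
                                    const std::vector<float> &weight_scales,
                                    float output_scale)
{
    std::vector<float> out;
    out.reserve(weight_scales.size());
    for (const float ws : weight_scales)
    {
        out.push_back(input_scale * ws / output_scale);
    }
    return out;
}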
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + const unsigned int pad_left = info.pad_left(); + const unsigned int pad_top = info.pad_top(); + const unsigned int pad_right = info.pad_right(); + const unsigned int pad_bottom = info.pad_bottom(); + + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first + + weights->dimension(width_idx)) < (pad_left + pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second + + weights->dimension(height_idx)) < (pad_top + pad_bottom)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - if(bias != nullptr) + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); + + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -105,46 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const uint32_t stride_x = info.stride().first; + const uint32_t stride_y = info.stride().second; + const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast<int32_t>(stride_x), + static_cast<int32_t>(stride_y), out_dims); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0, + "Negative padding not supported"); + + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, 
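Editor's note: the two checks above reject configurations whose padding exceeds the upsampled extent. For reference, the standard transposed-convolution output size along one axis is out = (in - 1) * stride + kernel - pad_left - pad_right, so the bracketed quantity must cover the total padding. A one-function sketch with a worked check:

#include <cassert>

// Standard transposed-convolution output extent along one axis.
constexpr int deconv_out_dim(int in, int kernel, int stride, int pad_l, int pad_r)
{
    return (in - 1) * stride + kernel - pad_l - pad_r;
}

// 4 inputs, kernel 4, stride 2, padding 1 on each side: (4-1)*2 + 4 - 2 = 8.
static_assert(deconv_out_dim(4, 4, 2, 1, 1) == 8, "worked example");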
deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + + // Do not perform upsampling when the operation uses unit stride in all dimensions + const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); + if (do_upsampling) + { + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? 
nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -157,32 +225,24 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _memory_group.manage(&_scaled_output); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); - scale_out_info.set_data_layout(data_layout); - _scaled_output.allocator()->init(scale_out_info); - - _upsample_f.configure(input, &_scaled_output, upsample_info); - - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + // Do not perform upsampling when the operation uses unit stride in all dimensions + _do_upsampling = stride_x != 1 || stride_y != 1; // Setup flip axis data _flip_axis.allocator()->allocate(); @@ -190,7 +250,32 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[0] = static_cast<uint32_t>(width_idx); axis_data[1] = static_cast<uint32_t>(height_idx); - _scaled_output.allocator()->allocate(); + // Setup convolution and upsampling, if needed + if (_do_upsampling) + { + _memory_group.manage(&_scaled_output); + + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + 
scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // Minor optimization: In the upsampling step, we do not need to allocate space for the padding in the upsampled image. + // The padding amount can be given as input to the convolution layer. + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + + _scaled_output.allocator()->allocate(); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + } } void NEDeconvolutionLayer::run() @@ -199,13 +284,16 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - _upsample_f.run(); + if (_do_upsampling) + { + _upsample_f.run(); + } _conv_f.run(); } void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index a2f890ef95..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,52 @@ */ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuCast.h" #include <utility> -using namespace arm_compute; +namespace arm_compute +{ +struct NEDepthConvertLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; +}; + +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::~NEDepthConvertLayer() = default; void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>(); - k->configure(input, output, policy, shift); - _kernel = std::move(k); + ARM_COMPUTE_UNUSED(shift); + + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_ERROR_ON(shift != 0); + + _impl->op = std::make_unique<cpu::CpuCast>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); +} + +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); + return cpu::CpuCast::validate(input, output, policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +void NEDepthConvertLayer::run() { - return NEDepthConvertLayerKernel::validate(input, output, policy, shift); + ITensorPack pack = {{ACL_SRC, _impl->src}, 
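Editor's note: the _do_upsampling flag introduced above encodes a simple identity: with unit stride the upsample stage inserts no zeros, so the deconvolution degenerates to an ordinary convolution whose padding comes straight from upsample_info, and the full-size intermediate tensor can be skipped entirely. A sketch of the decision, using placeholder types:

struct Pad { int l{0}, r{0}, t{0}, b{0}; };

// Upsampling only matters when some stride exceeds 1; otherwise fold the
// deconvolution padding directly into the convolution.
constexpr bool needs_upsampling(int stride_x, int stride_y)
{
    return stride_x != 1 || stride_y != 1;
}

inline Pad conv_padding(bool do_upsampling, const Pad &upsample_pad)
{
    return do_upsampling ? Pad{0, 0, 0, 0} : upsample_pad;
}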
{ACL_DST, _impl->dst}}; + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 3569eecf0a..5eea4dca65 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,15 +25,25 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + + auto k = std::make_unique<NEDepthToSpaceLayerKernel>(); k->configure(input, output, block_shape); _kernel = std::move(k); } @@ -42,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 7214971044..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
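Editor's note: NEDepthToSpaceLayer::run above now asks the kernel for its preferred split dimension and lets the scheduler parallelise the iteration window along that axis. A minimal stand-in showing the split-and-join mechanics with plain threads; no ACL scheduler types are involved:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Divide [0, extent) along the chosen dimension into one contiguous chunk
// per worker, run them concurrently, then join.
void schedule_along(std::size_t extent, std::size_t workers,
                    const std::function<void(std::size_t, std::size_t)> &body)
{
    std::vector<std::thread> pool;
    const std::size_t chunk = (extent + workers - 1) / std::max<std::size_t>(1, workers);
    for (std::size_t start = 0; start < extent; start += chunk)
    {
        pool.emplace_back(body, start, std::min(extent, start + chunk));
    }
    for (auto &t : pool)
    {
        t.join();
    }
}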
* * SPDX-License-Identifier: MIT * @@ -28,542 +28,358 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDepthwiseConv2d.h" + using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; namespace arm_compute { -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type()); - - if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) - { - TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? 
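Editor's note: the deleted validation above bounds the dilated kernel footprint by the padded input: k taps with dilation d span k + (k - 1) * (d - 1) samples, which is exactly the quantity compared against the input extent plus padding. Made explicit as a small helper:

// Receptive extent of a dilated kernel along one axis: k taps separated by
// (d - 1) skipped samples.
constexpr int dilated_kernel_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}

static_assert(dilated_kernel_extent(3, 1) == 3, "dilation 1 is a no-op");
static_assert(dilated_kernel_extent(3, 2) == 5, "3 taps with one gap each");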
&accumulator : output, conv_info, depth_multiplier, dilation)); - - if(is_quantized) - { - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.output_data_type = input->data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, direct_conv_info)); - } - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); - } +NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; - //Validate Activation Layer - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - return Status{}; -} -} // namespace - -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), - _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl +{ + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 + Tensor permuted_input{}; // INT_0 + Tensor permuted_weights{}; // INT_1 + Tensor permuted_output{}; // INT_2 + Tensor workspace{}; // INT_3 + Tensor packed_weights{}; // INT_4 + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; + bool is_prepared{false}; + bool permute{false}; +}; + +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ARM_COMPUTE_UNUSED(act_info); - - PixelValue zero_value(0.f); - - // Initialize the intermediate accumulator tensor in case of quantized input - if(_is_quantized) - { - TensorShape accum_shape = output->info()->tensor_shape(); - DataLayout accum_layout = output->info()->data_layout(); - if(!_is_nchw) - { - permute(accum_shape, PermutationVector(1U, 2U, 0U)); - accum_layout = DataLayout::NCHW; - } - - _memory_group.manage(&_accumulator); - _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info())); - _accumulator.info()->set_data_layout(accum_layout); - zero_value = 
PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset)); - } - - if(!_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Configure depthwise - _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation); + bool is_nhwc = input->info()->data_layout() == DataLayout::NCHW; + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + _impl->permute = is_nhwc; - // Configure border handler - _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info); - // Allocate tensors - _permuted_input.allocator()->allocate(); - } - else - { - // Configure depthwise convolution kernel - _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation); + // Configure pipeline + ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); + const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); + bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - // Configure border handler - _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); - } - - // Configure biases accumulation - if(_is_quantized) - { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform(); - - float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale; - int32_t output_multiplier; - int32_t output_shift; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.result_fixedpoint_multiplier = output_multiplier; - direct_conv_info.result_shift = output_shift; - direct_conv_info.result_offset_after_shift = oq_info.offset; - direct_conv_info.output_data_type = input->info()->data_type(); - _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, direct_conv_info); - _accumulator.allocator()->allocate(); - } - else if(_has_bias) + if (!is_activationlayer_enabled) { - _output_stage_kernel.configure(_is_nchw ? 
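Editor's note: the removed quantized output stage above collapses the three quantization scales into one real factor, (input_scale * weight_scale) / output_scale, then splits it into a fixed-point multiplier and a right shift for the integer kernels. A sketch of that decomposition via frexp, assuming the usual case of a factor in (0, 1):

#include <cmath>
#include <cstdint>

// Express m in (0, 1) as (q / 2^31) * 2^(-shift), with q a Q0.31 value in
// [2^30, 2^31): the form integer requantization kernels consume.
void quantize_multiplier(double m, std::int32_t &q, int &shift)
{
    int exp = 0;
    const double frac = std::frexp(m, &exp); // m = frac * 2^exp, frac in [0.5, 1)
    shift = -exp;                            // so m = frac * 2^(-shift)
    auto q64 = static_cast<std::int64_t>(std::llround(frac * (1ll << 31)));
    if (q64 == (1ll << 31)) // frac rounded up to 1.0: renormalise
    {
        q64 /= 2;
        --shift;
    }
    q = static_cast<std::int32_t>(q64);
}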
output : &_permuted_output, biases); + act_info_to_use = act_info; } + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; - // Permute output - if(!_is_nchw) - { - // Configure the function to transform the convoluted output to NHWC - _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); - _permuted_output.allocator()->allocate(); - } -} + auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>(); -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); - _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!_is_activationlayer_enabled) + if (is_nhwc) { - act_info_to_use = act_info; - } + auto permute_input = std::make_unique<cpu::CpuPermute>(); + auto permute_weights = std::make_unique<cpu::CpuPermute>(); + auto permute_output = std::make_unique<cpu::CpuPermute>(); - if(_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); + _memory_group.manage(&_impl->permuted_input); + _memory_group.manage(&_impl->permuted_weights); + _memory_group.manage(&_impl->permuted_output); // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); - _permuted_input.info()->set_data_layout(DataLayout::NHWC); + permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC); // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); - _permuted_weights.info()->set_data_layout(DataLayout::NHWC); + permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); - _permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); + _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - _permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); + permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); - // Allocate tensors - _permuted_input.allocator()->allocate(); - _permuted_output.allocator()->allocate(); + _impl->permuted_input.allocator()->allocate(); + _impl->permuted_output.allocator()->allocate(); } else { - _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation); - } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _has_bias = biases != nullptr; - _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(), - weights->info(), - conv_info, - depth_multiplier, - dilation); - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _permute = _is_optimized == _is_nchw; - _is_prepared = false; - _is_activationlayer_enabled = act_info.enabled(); - - // Configure appropriate pipeline - if(_is_optimized) - { - configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - else - { - configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(output, nullptr, act_info); - } -} - -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic() -{ - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimX); - - // Execute depthwise convolution - NEScheduler::get().schedule(&_dwc_kernel, Window::DimX); - - // Add biases - if(_has_bias || _is_quantized) - { - NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } - // Permute output - if(!_is_nchw) - { - _permute_output.run(); - } + // Allocate memory based on the internal memory requirements + experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); + _memory_group.manage(&_impl->workspace); + _memory_group.manage(&_impl->packed_weights); + _impl->workspace.allocator()->allocate(); + _impl->packed_weights.allocator()->allocate(); } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized() +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - // Run assembly function - _dwc_optimized_func.run(); - - // Permute output - if(_is_nchw) - { - _permute_output.run(); - } + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - // Permute input - if(_permute) - { - _permute_input.run(); - } - - _is_optimized ? run_optimized() : run_generic(); - - // Run activation - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output); + pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace); + pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + + _impl->op->run(pack); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_permute) + if (_impl->permute) { - _permuted_weights.allocator()->allocate(); - _permute_weights.run(); - _original_weights->mark_as_unused(); + _impl->permuted_weights.allocator()->allocate(); } - // Prepare optimized function - if(_is_optimized) + if (!_impl->permuted_weights.is_used()) { - _dwc_optimized_func.prepare(); - if(!_permuted_weights.is_used()) - { - _permuted_weights.allocator()->free(); - } + _impl->permuted_weights.allocator()->free(); } - _is_prepared = true; + _impl->is_prepared = true; } } +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl +{ + Tensor permuted_input{}; + Tensor permuted_weights{}; + Tensor permuted_output{}; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; 
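Editor's note: the optimized path above sizes its workspace and packed-weights tensors as size + alignment bytes taken from the operator's reported memory requirements, so a suitably aligned sub-span always fits regardless of where the allocator places the base pointer. A standalone sketch of that sizing rule:

#include <cstddef>
#include <cstdint>
#include <vector>

struct MemoryRequirement
{
    std::size_t size{0};
    std::size_t alignment{1};
};

// Over-allocate by the alignment, then compute the first aligned offset.
struct AlignedBuffer
{
    explicit AlignedBuffer(const MemoryRequirement &req)
        : storage(req.size + req.alignment)
    {
        const auto base = reinterpret_cast<std::uintptr_t>(storage.data());
        offset          = (req.alignment - base % req.alignment) % req.alignment;
    }
    std::vector<std::int8_t> storage; // raw bytes, like the DataType::S8 tensors above
    std::size_t offset{0};            // aligned region is [data() + offset, data() + offset + size)
};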
+}; + NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() - : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), - _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr) + : _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); + + _impl->src = input; + _impl->dst = output; + _impl->weights = weights; + _impl->biases = biases; + _impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _impl->is_prepared = !_impl->is_nchw; ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_is_nchw) + if (_impl->is_nchw) { - _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); - _permuted_input.info()->set_data_layout(DataLayout::NHWC); - input_to_use = &_permuted_input; + auto permute_input = std::make_unique<cpu::CpuPermute>(); + auto permute_weights = std::make_unique<cpu::CpuPermute>(); + + permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC); + input_to_use = &_impl->permuted_input; - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); - _permuted_weights.info()->set_data_layout(DataLayout::NHWC); - weights_to_use = &_permuted_weights; + permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); + weights_to_use = &_impl->permuted_weights; - _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - output_to_use = &_permuted_output; + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + output_to_use = &_impl->permuted_output; } - _original_weights = weights_to_use; - _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation); - _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), 
BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info())); + auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); - if(_is_nchw) + if (_impl->is_nchw) { - _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); - _permuted_output.info()->set_data_layout(DataLayout::NHWC); + auto permute_output = std::make_unique<cpu::CpuPermute>(); + permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permuted_input.allocator()->allocate(); - _permuted_weights.allocator()->allocate(); - _permuted_output.allocator()->allocate(); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(output, nullptr, act_info); + _impl->permuted_input.allocator()->allocate(); + _impl->permuted_weights.allocator()->allocate(); + _impl->permuted_output.allocator()->allocate(); } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - if(input->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation)); - } - else - 
{ - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation)); - } - - // Validate Activation Layer - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run() { - if(_is_nchw) - { - prepare(); - _permute_input.run(); - } - - NEScheduler::get().schedule(&_fill_border, Window::DimX); - NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY); - - if(_is_nchw) - { - _permute_output.run(); - } - - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _permute_weights.run(); - _original_weights->mark_as_unused(); - _is_prepared = true; - } + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + + _impl->op->run(pack); } NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic() + : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +#ifndef DOXYGEN_SKIP_THIS +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl { - _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED}; + NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr}; + NEDepthwiseConvolutionLayerGeneric func_generic{}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; +}; +#endif // DOXYGEN_SKIP_THIS + +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - switch(depth_conv_func) + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); + + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>(); + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - return NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - if(bool(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(); + _impl->func_optimized.run(); break; case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(); + _impl->func_generic.run(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); @@ -572,13 +388,13 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(); + _impl->func_optimized.prepare(); break; case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(); + _impl->func_generic.prepare(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 42a0ee0895..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
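Note on the hunks above: the public NEDepthwiseConvolutionLayer interface is unchanged; the function is now a dispatcher whose Impl owns a cpu::CpuDepthwiseConv2d, and both internal paths run by filling an ITensorPack instead of invoking member kernels. A minimal caller-side sketch of that unchanged contract follows (shapes, layout and padding here are illustrative assumptions, not taken from this patch):

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, weights, dst;
        // NHWC, batch 1: ACL dimension order is (C, W, H, N)
        TensorInfo src_info(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
        TensorInfo wei_info(TensorShape(32U, 3U, 3U), 1, DataType::F32); // 3x3 depthwise filter
        TensorInfo dst_info(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        wei_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        dst.allocator()->init(dst_info);

        NEDepthwiseConvolutionLayer dwc;
        dwc.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();

        dwc.run(); // dispatches to the OPTIMIZED or GENERIC path chosen at configure time
        return 0;
    }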
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 42a0ee0895..28d19d2950 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,20 +24,43 @@
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"

-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/cpu/operators/CpuDequantize.h"

 namespace arm_compute
 {
+struct NEDequantizationLayer::Impl
+{
+    const ITensor                       *src{nullptr};
+    ITensor                             *dst{nullptr};
+    std::unique_ptr<cpu::CpuDequantize>  op{nullptr};
+};
+
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
+{
+}
+NEDequantizationLayer::~NEDequantizationLayer() = default;
+
 void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuDequantize>();
+    _impl->op->configure(input->info(), output->info());
 }

 Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return NEDequantizationLayerKernel::validate(input, output);
+    return cpu::CpuDequantize::validate(input, output);
+}
+
+void NEDequantizationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
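NEDequantizationLayer is now a thin stateful wrapper: configure() builds a cpu::CpuDequantize from ITensorInfo only, and run() binds the actual tensors. A sketch of the equivalent operator-level call (cpu::CpuDequantize is an internal operator, so this mirrors the wrapper above rather than documenting a public API; the shapes and quantization parameters are assumptions):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDequantize.h"

    using namespace arm_compute;

    void dequantize_example()
    {
        Tensor q_src, f_dst;
        q_src.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        f_dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

        cpu::CpuDequantize op;
        op.configure(q_src.info(), f_dst.info()); // configured on ITensorInfo, not ITensor

        q_src.allocator()->allocate();
        f_dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &q_src);
        pack.add_tensor(TensorType::ACL_DST, &f_dst);
        op.run(pack); // tensors are bound only at run time
    }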
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
deleted file mode 100644
index 81180307f6..0000000000
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEDerivative::NEDerivative()
-    : _kernel(), _border_handler()
-{
-}
-
-void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
-
-void NEDerivative::run()
-{
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    NEScheduler::get().schedule(&_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 0d97689d0a..b347390162 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"

+#include "src/common/utils/Log.h"
+
 #include <cstddef>
 #include <ios>
 #include <list>
@@ -34,23 +36,36 @@
 namespace arm_compute
 {
 NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+    : _memory_group(std::move(memory_manager)),
+      _dequantize(),
+      _detection_post_process(),
+      _decoded_scores(),
+      _run_dequantize(false)
 {
 }

-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
-                                            ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+                                            const ITensor *input_scores,
+                                            const ITensor *input_anchors,
+                                            ITensor *output_boxes,
+                                            ITensor *output_classes,
+                                            ITensor *output_scores,
+                                            ITensor *num_detection,
+                                            DetectionPostProcessLayerInfo info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
-                                                                     output_scores->info(),
-                                                                     num_detection->info(), info));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+                                 output_scores);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+        input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+        output_classes->info(), output_scores->info(), num_detection->info(), info));
+    ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+                           num_detection, info);

     const ITensor *input_scores_to_use = input_scores;
     DetectionPostProcessLayerInfo info_to_use = info;
     _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());

-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _memory_group.manage(&_decoded_scores);

@@ -59,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c
         input_scores_to_use = &_decoded_scores;

         // Create a new info struct to avoid dequantizing in the CPP layer
-        std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
-        DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
-                                                     scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+        std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+                                           info.scale_value_w()};
+        DetectionPostProcessLayerInfo info_quantized(
+            info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+            info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
         info_to_use = info_quantized;
     }

-    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+                                      output_classes, output_scores, num_detection, info_to_use);
     _decoded_scores.allocator()->allocate();
 }

-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
-                                             ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+                                             const ITensorInfo *input_scores,
+                                             const ITensorInfo *input_anchors,
+                                             ITensorInfo *output_boxes,
+                                             ITensorInfo *output_classes,
+                                             ITensorInfo *output_scores,
+                                             ITensorInfo *num_detection,
+                                             DetectionPostProcessLayerInfo info)
 {
     bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
-    if(run_dequantize)
+    if (run_dequantize)
     {
         TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
         ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+                                                                       output_boxes, output_classes, output_scores,
+                                                                       num_detection, info));

     return Status{};
 }
@@ -88,7 +114,7 @@ void NEDetectionPostProcessLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);

     // Decode scores if necessary
-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _dequantize.run();
     }
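The _decoded_scores tensor above follows the library's transient-tensor idiom: it is registered with the function's MemoryGroup before allocation, and the backing memory is only held while a MemoryGroupResourceScope is alive inside run(). The idiom in isolation (a generic sketch, not code from this patch; the shape is an assumption):

    #include "arm_compute/runtime/MemoryGroup.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void scratch_example(std::shared_ptr<IMemoryManager> memory_manager)
    {
        MemoryGroup group(std::move(memory_manager));
        Tensor scratch;
        scratch.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

        group.manage(&scratch);          // declare the lifetime before allocating
        scratch.allocator()->allocate(); // backing memory is provided through the group

        MemoryGroupResourceScope scope_mg(group); // acquire memory for this scope
        // ... run kernels/functions that read and write scratch ...
    }   // scope ends: the manager may hand the same memory to another function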
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 751a3fa1fb..f1c2cf969f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,95 +28,58 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"

-#include <cmath>
-#include <tuple>
+#include "src/cpu/operators/CpuDirectConv2d.h"

 namespace arm_compute
 {
+struct NEDirectConvolutionLayer::Impl
+{
+    ITensor                              *src{nullptr};
+    const ITensor                        *weights{nullptr};
+    const ITensor                        *bias{nullptr};
+    ITensor                              *dst{nullptr};
+    std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
+};
+
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
-      _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+    : _memory_manager(std::move(memory_manager)), _impl(std::make_unique<Impl>())
 {
 }
-
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
+
+void NEDirectConvolutionLayer::configure(ITensor *input,
+                                         const ITensor *weights,
+                                         const ITensor *bias,
+                                         ITensor *output,
+                                         const PadStrideInfo &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
-
-    // Free accumulator
-    if(_accumulator.buffer() != nullptr)
-    {
-        _accumulator.allocator()->free();
-    }
-
-    _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
-
-    // Check if bias should be added in the convolution result
-    _has_bias = (bias != nullptr);
-
-    _conv_kernel.configure(input, weights, output, conv_info);
-    if(_has_bias)
-    {
-        _output_stage_kernel.configure(output, bias);
-    }
-
-    // Add zero padding XY
-    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
-
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->bias    = bias;
+    _impl->dst     = output;
+    _impl->op      = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
+    _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+                         conv_info, act_info);
 }

-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input,
+                                          const ITensorInfo *weights,
+                                          const ITensorInfo *bias,
+                                          const ITensorInfo *output,
+                                          const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-
-    // output might not be initialized since it can be an intermediate tensor of another layer
-    DataType data_type = input->data_type();
-    TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
-
-    // Validate Convolution kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
-                                        "Biases size and number of input feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
-    }
-
-    // Validate bias kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
-
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
-    return Status{};
+    return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
 }

 void NEDirectConvolutionLayer::run()
 {
-    NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
-    if(_has_bias)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
-    }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->bias);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
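As with the depthwise case, border filling, the bias output stage and the fused activation all now live inside cpu::CpuDirectConv2d, so the NE function body reduces to pointer bookkeeping plus a tensor pack. A caller-side sketch (the shapes are assumptions; the public signature is unchanged by this patch):

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void direct_conv_example()
    {
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));       // W x H x C, NCHW
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32)); // 3x3, 8 filters
        bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 8U), 1, DataType::F32));       // no pad, stride 1

        NEDirectConvolutionLayer conv; // optionally pass a shared IMemoryManager
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run(); // packs ACL_SRC_0..2 / ACL_DST and calls CpuDirectConv2d::run()
    }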
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
new file mode 100644
index 0000000000..685ef2d4d7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+struct NEElementwiseMax::Impl
+{
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
+};
+
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
+NEElementwiseMax::~NEElementwiseMax() = default;
+
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseMax>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseMax::validate(input1, input2, output);
+}
+
+void NEElementwiseMax::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseMin::Impl
+{
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
+};
+
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
+NEElementwiseMin::~NEElementwiseMin() = default;
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseMin>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseMin::validate(input1, input2, output);
+}
+
+void NEElementwiseMin::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseSquaredDiff::Impl
+{
+    const ITensor                                  *src_0{nullptr};
+    const ITensor                                  *src_1{nullptr};
+    ITensor                                        *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
+};
+
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default;
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1,
+                                         ITensor *input2,
+                                         ITensor *output,
+                                         const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseSquaredDiff>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+                                          const ITensorInfo *input2,
+                                          const ITensorInfo *output,
+                                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseDivision::Impl
+{
+    const ITensor                               *src_0{nullptr};
+    const ITensor                               *src_1{nullptr};
+    ITensor                                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
+};
+
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::~NEElementwiseDivision() = default;
+
+void NEElementwiseDivision::configure(ITensor *input1,
+                                      ITensor *input2,
+                                      ITensor *output,
+                                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseDivision>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1,
+                                       const ITensorInfo *input2,
+                                       const ITensorInfo *output,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseDivision::validate(input1, input2, output);
+}
+
+void NEElementwiseDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwisePower::Impl
+{
+    const ITensor                            *src_0{nullptr};
+    const ITensor                            *src_1{nullptr};
+    ITensor                                  *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
+};
+
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
+NEElementwisePower::~NEElementwisePower() = default;
+
+void NEElementwisePower::configure(ITensor *input1,
+                                   ITensor *input2,
+                                   ITensor *output,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwisePower>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwisePower::validate(const ITensorInfo *input1,
+                                    const ITensorInfo *input2,
+                                    const ITensorInfo *output,
+                                    const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwisePower::validate(input1, input2, output);
+}
+
+void NEElementwisePower::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+template <ComparisonOperation COP>
+struct NEElementwiseComparisonStatic<COP>::Impl
+{
+    const ITensor                                             *src_0{nullptr};
+    const ITensor                                             *src_1{nullptr};
+    ITensor                                                   *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>>  op{nullptr};
+};
+
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
+{
+}
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseComparisonStatic<COP>>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+                                                    const ITensorInfo *input2,
+                                                    const ITensorInfo *output)
+{
+    return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseComparison::Impl
+{
+    const ITensor                                 *src_0{nullptr};
+    const ITensor                                 *src_1{nullptr};
+    ITensor                                       *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
+};
+
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::~NEElementwiseComparison() = default;
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseComparison>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), op);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1,
+                                         const ITensorInfo *input2,
+                                         const ITensorInfo *output,
+                                         ComparisonOperation op)
+{
+    return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
+}
+
+void NEElementwiseComparison::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
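Every class in this new file repeats the same shape: a pimpl holding src/dst pointers plus the matching cpu:: operator, with validate() explicitly rejecting a fused activation on CPU (ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled())). A usage sketch (F32 inputs are an assumption; comparison outputs are U8):

    #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void elementwise_example()
    {
        Tensor a, b, max_out, cmp_out;
        const TensorShape shape(16U);
        a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        max_out.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        cmp_out.allocator()->init(TensorInfo(shape, 1, DataType::U8)); // comparisons write U8

        NEElementwiseMax emax;
        emax.configure(&a, &b, &max_out); // act_info must stay default: validate() rejects it

        NEElementwiseComparison greater;
        greater.configure(&a, &b, &cmp_out, ComparisonOperation::Greater);

        a.allocator()->allocate();
        b.allocator()->allocate();
        max_out.allocator()->allocate();
        cmp_out.allocator()->allocate();

        emax.run();
        greater.run();
    }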
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
deleted file mode 100644
index 926ae1fa21..0000000000
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
-
-#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::MAX, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
-}
-
-void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::MIN, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
-}
-
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
-}
-
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEDivisionOperationKernel::validate(input1, input2, output);
-}
-
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEPowerOperationKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEPowerOperationKernel::validate(input1, input2, output);
-}
-
-template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
-    k->configure(COP, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    return NEComparisonOperationKernel::validate(COP, input1, input2, output);
-}
-
-void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
-    k->configure(op, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
-{
-    return NEComparisonOperationKernel::validate(op, input1, input2, output);
-}
-
-// Supported Specializations
-template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 80db027398..23a092c407 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,88 +23,63 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"

-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
-#include "support/MemorySupport.h"
+#include "src/cpu/operators/CpuElementwiseUnary.h"

 #include <utility>

 namespace arm_compute
 {
-void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::RSQRT, input, output);
-    _kernel = std::move(k);
-}
-Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
-}
+using OperatorType = cpu::CpuElementwiseUnary;

-void NEExpLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::EXP, input, output);
-    _kernel = std::move(k);
-}
-Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+struct NEElementwiseUnaryLayer<op>::Impl
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
-}
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<OperatorType> cpu_op{nullptr};
+};

-void NENegLayer::configure(const ITensor *input, ITensor *output)
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::NEG, input, output);
-    _kernel = std::move(k);
-}
-Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::NEG, input, output);
 }
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;

-void NELogLayer::configure(const ITensor *input, ITensor *output)
+template <ElementWiseUnary op>
+void NEElementwiseUnaryLayer<op>::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::LOG, input, output);
-    _kernel = std::move(k);
-}
-Status NELogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::LOG, input, output);
+    _impl->src    = input;
+    _impl->dst    = output;
+    _impl->cpu_op = std::make_unique<OperatorType>();
+    _impl->cpu_op->configure(op, *_impl->src->info(), *_impl->dst->info());
 }

-void NEAbsLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::ABS, input, output);
-    _kernel = std::move(k);
-}
-Status NEAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::ABS, input, output);
+    return OperatorType::validate(op, *input, *output);
 }

-void NERoundLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::ROUND, input, output);
-    _kernel = std::move(k);
-}
-Status NERoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+void NEElementwiseUnaryLayer<op>::run()
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::ROUND, input, output);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->cpu_op->run(pack);
 }

-void NESinLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::SIN, input, output);
-    _kernel = std::move(k);
-}
-Status NESinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::SIN, input, output);
-}
+template class NEElementwiseUnaryLayer<ElementWiseUnary::RSQRT>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::EXP>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::NEG>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::LOG>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::ABS>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::ROUND>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::SIN>;
 } // namespace arm_compute
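The seven single-purpose classes above collapse into one template over ElementWiseUnary; if the accompanying header change follows the same pattern, the old names survive as aliases (e.g. NERsqrtLayer for the RSQRT instantiation), so existing callers recompile unchanged — treat that alias as an assumption here. Usage sketch:

    #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void rsqrt_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

        NEElementwiseUnaryLayer<ElementWiseUnary::RSQRT> rsqrt; // assumed alias: NERsqrtLayer
        rsqrt.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        rsqrt.run(); // forwards {ACL_SRC, ACL_DST} to cpu::CpuElementwiseUnary
    }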
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
deleted file mode 100644
index 70b93cae9e..0000000000
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEEqualizeHistogram::NEEqualizeHistogram()
-    : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-void NEEqualizeHistogram::configure(const IImage *input, IImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Configure kernels
-    _histogram_kernel.configure(input, &_hist);
-    _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
-    _map_histogram_kernel.configure(input, &_cd_lut, output);
-}
-
-void NEEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
-
-    // Map input to output using created LUT.
-    NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
deleted file mode 100644
index 4f773b7091..0000000000
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEErode.h"
-
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 25ba1c8391..fb75f9da29 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,13 +25,28 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"

+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
+NEFFT1D::~NEFFT1D() = default;
+
 NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+    : _memory_group(std::move(memory_manager)),
+      _digit_reverse_kernel(),
+      _fft_kernels(),
+      _scale_kernel(),
+      _digit_reversed_input(),
+      _digit_reverse_indices(),
+      _num_ffts(0),
+      _axis(0),
+      _run_scale(false)
 {
 }

@@ -39,6 +54,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);

     // Decompose size to radix factors
     const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -58,7 +74,8 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel = std::make_unique<NEFFTDigitReverseKernel>();
+    _digit_reverse_kernel->configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);

     // Create and configure FFT kernels
     unsigned int Nx = 1;
@@ -66,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
     _fft_kernels.resize(_num_ffts);
     _axis = config.axis;

-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);

@@ -75,18 +92,22 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
fft_kernel_info.is_first_stage = (i == 0); - _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>(); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + _scale_kernel = std::make_unique<NEFFTScaleKernel>(); + is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -103,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -112,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -128,17 +149,17 @@ void NEFFT1D::run() { MemoryGroupResourceScope scope_mg(_memory_group); - NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ)); + NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { - NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX)); + NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { - NEScheduler::get().schedule(&_scale_kernel, Window::DimY); + NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index 2fea017781..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,10 +27,17 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { +NEFFT2D::~NEFFT2D() = default; + NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -38,6 +45,7 @@ void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo & { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Setup first pass FFT1DInfo first_pass_config; @@ -74,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 08230074c3..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,9 +25,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/fft.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" +#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" +#include "src/core/NEON/kernels/NEFFTScaleKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/utils/helpers/fft.h" namespace arm_compute { @@ -39,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -93,10 +101,19 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem _is_prepared(false) { } - -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) +NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; + +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { + ARM_COMPUTE_UNUSED(enable_fast_math); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, 
act_info, enable_fast_math); + _original_weights = weights; _original_bias = biases; @@ -104,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? &_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -126,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -147,18 +167,18 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights - _transform_weights_func = support::cpp14::make_unique<NEFFT2D>(); + _transform_weights_func = std::make_unique<NEFFT2D>(); _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -182,7 +202,8 @@ void 
NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -194,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -224,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -236,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -249,9 +273,16 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { + ARM_COMPUTE_UNUSED(enable_fast_math); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); 
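
Alongside the quoted hunks, NEFFT1D, NEFFT2D and NEFFTConvolutionLayer all gain an out-of-line `~NEFFT1D() = default;` (and equivalents) in the .cpp file. That placement is forced by the other half of the change: kernels are now held as std::unique_ptr to types that the headers only forward-declare, and destroying a unique_ptr requires the pointee to be a complete type. A single-file sketch of the idiom follows; FunctionSketch and KernelSketch are hypothetical stand-ins, not ACL classes.

#include <iostream>
#include <memory>

class KernelSketch; // forward declaration only, as in the new headers

class FunctionSketch
{
public:
    FunctionSketch();
    ~FunctionSketch(); // declared here, defined below where KernelSketch is complete
    void configure(int radix);
    void run();

private:
    std::unique_ptr<KernelSketch> _kernel;
};

// In the real code everything from here down lives in the .cpp, which
// includes the kernel header and therefore sees the complete type.
class KernelSketch
{
public:
    explicit KernelSketch(int radix) : _radix(radix) {}
    void run() const { std::cout << "radix-" << _radix << " stage\n"; }

private:
    int _radix;
};

FunctionSketch::FunctionSketch() = default;
FunctionSketch::~FunctionSketch() = default; // legal only here: unique_ptr needs the complete type

void FunctionSketch::configure(int radix)
{
    _kernel = std::make_unique<KernelSketch>(radix);
}

void FunctionSketch::run()
{
    _kernel->run(); // the library instead hands _kernel.get() to NEScheduler
}

int main()
{
    FunctionSketch f;
    f.configure(4);
    f.run();
    return 0;
}

Moving the destructor definition (and the kernel includes) out of the public headers is what lets these diffs replace "arm_compute/core/NEON/kernels/..." includes with forward declarations and "src/core/NEON/kernels/..." includes in the .cpp files.
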
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); @@ -266,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -278,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -300,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -318,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -336,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -349,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp deleted file mode 100644 index af3530151c..0000000000 --- a/src/runtime/NEON/functions/NEFastCorners.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEFastCorners.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(), - _border_handler(), - _nonmax_kernel(), - _fill_kernel(), - _output(), - _suppressed(), - _non_max(false) -{ -} - -void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255); - - _non_max = nonmax_suppression; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8); - _output.allocator()->init(tensor_info); - _memory_group.manage(&_output); - - // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3, - // width - 3) and ywindow (3, height -3) so the output image will leave the - // pixels on the borders unchanged. This is reflected in the valid region - // of the output. The non maxima suppression is only run on the valid - // pixels. 
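
The NEFastCorners file being deleted in this hunk is also a compact illustration of the memory-group discipline used throughout these functions: configure() hands each intermediate tensor to the group with manage() as soon as it is declared, calls allocate() only after every kernel consuming it has been configured, and run() then holds the whole group alive through an RAII scope (MemoryGroupResourceScope in the real code). A toy sketch of that lifetime contract, assuming hypothetical ToyTensor/ToyMemoryGroup/ToyScope stand-ins rather than ACL types:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// A "tensor" here is just a named buffer; the group records which
// intermediates it owns so they can be acquired/released as a unit.
struct ToyTensor
{
    std::string name;
    std::vector<float> data;
    void allocate(std::size_t n) { data.resize(n); }
};

class ToyMemoryGroup
{
public:
    void manage(ToyTensor *t) { _managed.push_back(t); }
    void acquire() const
    {
        for (const auto *t : _managed)
            std::cout << "acquire " << t->name << "\n";
    }
    void release() const
    {
        for (const auto *t : _managed)
            std::cout << "release " << t->name << "\n";
    }

private:
    std::vector<ToyTensor *> _managed;
};

// RAII scope mirroring the role of MemoryGroupResourceScope: backing memory
// is held only while run() executes, so intermediates of different
// functions can share the same pool.
class ToyScope
{
public:
    explicit ToyScope(ToyMemoryGroup &g) : _g(g) { _g.acquire(); }
    ~ToyScope() { _g.release(); }

private:
    ToyMemoryGroup &_g;
};

int main()
{
    ToyMemoryGroup group;
    ToyTensor output{"output", {}};

    group.manage(&output); // configure(): register the intermediate first...
    output.allocate(16);   // ...allocate only once all consumers are configured

    {
        ToyScope scope(group); // run(): hold the group for the whole execution
        assert(output.data.size() == 16);
    }
    return 0;
}
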
- _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode); - _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value); - - if(!_non_max) - { - _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners); - } - else - { - _suppressed.allocator()->init(tensor_info); - _memory_group.manage(&_suppressed); - _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode); - _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners); - - // Allocate intermediate tensors - _suppressed.allocator()->allocate(); - } - - // Allocate intermediate tensors - _output.allocator()->allocate(); -} - -void NEFastCorners::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY); - - if(_non_max) - { - NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY); - } - - NEScheduler::get().schedule(&_fill_kernel, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index d507f7c88f..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,18 +23,40 @@ */ #include "arm_compute/runtime/NEON/functions/NEFill.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuFill.h" #include <utility> namespace arm_compute { +struct NEFill::Impl +{ + ITensor *tensor{nullptr}; + std::unique_ptr<cpu::CpuFill> op{nullptr}; +}; + +NEFill::NEFill() : _impl(std::make_unique<Impl>()) +{ +} +NEFill::NEFill(NEFill &&) = default; +NEFill &NEFill::operator=(NEFill &&) = default; +NEFill::~NEFill() = default; + void NEFill::configure(ITensor *tensor, PixelValue constant_value) { - auto k = arm_compute::support::cpp14::make_unique<NEMemsetKernel>(); - k->configure(tensor, constant_value); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + + _impl->tensor = tensor; + _impl->op = std::make_unique<cpu::CpuFill>(); + _impl->op->configure(tensor->info(), constant_value); +} + +void NEFill::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, _impl->tensor); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index 6b7a0faa85..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,15 +26,27 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" + namespace arm_compute { -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) +{ +} + +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { - _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value); + ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); + _border_handler = std::make_unique<NEFillBorderKernel>(); + _border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value); } void NEFillBorder::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index a28411c4e9..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,57 @@ */ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/Size2D.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuFlatten.h" namespace arm_compute { +struct NEFlattenLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFlatten> op{nullptr}; +}; + +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; +NEFlattenLayer::~NEFlattenLayer() = default; + void NEFlattenLayer::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); + + _impl->op = std::make_unique<cpu::CpuFlatten>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info()); } Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NEFlattenLayerKernel::validate(input, output); + // Checks performed when output is configured + if (output->total_size() != 0) + { + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + } + return cpu::CpuFlatten::validate(input, output); +} +void NEFlattenLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index 98b9725329..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,20 +23,47 @@ */ #include "arm_compute/runtime/NEON/functions/NEFloor.h" -#include "arm_compute/core/NEON/kernels/NEFloorKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuFloor.h" namespace arm_compute { +struct NEFloor::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFloor> op{nullptr}; +}; + +NEFloor::NEFloor() : _impl(std::make_unique<Impl>()) +{ +} +NEFloor::NEFloor(NEFloor &&) = default; +NEFloor &NEFloor::operator=(NEFloor &&) = default; +NEFloor::~NEFloor() = default; + void NEFloor::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<cpu::CpuFloor>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info()); } Status NEFloor::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NEFloorKernel::validate(input, output); + return cpu::CpuFloor::validate(input, output); +} + +void NEFloor::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index e275bca2f9..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,466 +23,138 @@ */ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" -#include <algorithm> -#include <cmath> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; -namespace +struct NEFullyConnectedLayer::Impl { -// Get min, max bound of a quantized assymetric output tensor, with the effect of fused activation -std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) -{ - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - const UniformQuantizationInfo q_unif = q_info.uniform(); - - if(act_info.enabled()) - { - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(q_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(q_unif.offset); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(act_info.b(), data_type, q_info); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - return std::make_pair(type_min, type_max); -} - -Status get_gemmlowp_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) -{ - const auto data_type = input->data_type(); - const QuantizationInfo oq_info = output->quantization_info(); - const UniformQuantizationInfo iq_unif = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; - int32_t output_multiplier; - int32_t output_shift; - - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type); - - gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage_info.gemmlowp_shift = output_shift; - gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; - gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>(); - gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>(); - - return Status{}; -} - -Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo 
*output, const ActivationLayerInfo &act) -{ - if(is_data_type_quantized_asymmetric(input->data_type())) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(input->quantization_info().uniform().scale, -input->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); - - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(input, weights, output, act, gemmlowp_output_stage_info)); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_quantization_info(input_quantization_info), - &weights->clone()->set_quantization_info(weights_quantization_info), - biases, - output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(input, weights, biases, output, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */))); - } - - return Status{}; -} -} // namespace - -void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return NETransposeKernel::validate(input, output); -} - -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(), - _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), - _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), _is_prepared(false) -{ -} - -void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) -{ - if(_is_quantized_asymmetric) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Configure gemmlowp function and output stage for asymmetric quantized types - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(input->info(), weights->info(), output->info(), act, gemmlowp_output_stage_info); - ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); - - GEMMInfo 
gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - gemm_info.set_activation_info(act); - _mm_gemmlowp.configure(input, weights, biases, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); - gemm_info.set_activation_info(act); - _mm_gemm.configure(input, weights, biases, output, 1.f, 1.0f, gemm_info); - } -} - -void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) -{ - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + std::unique_ptr<cpu::CpuFullyConnected> op{nullptr}; - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); + const ITensor *original_weights{nullptr}; - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - _flatten_kernel.configure(input, &_flatten_output); + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, biases, output, act); + bool is_prepared{false}; + bool dynamic_weights{false}; +}; - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); -} +NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); - - // Configure matrix multiply kernel - configure_mm(input, weights, biases, output, act); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); - - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type()); - _original_weights = weights; - - if(_weights_manager) - { - _weights_manager->manage(weights); - } - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches + output->info(), fc_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); - const ITensor *weights_to_use = weights; + _impl->op = std::make_unique<cpu::CpuFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = false; - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) - { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); - } - else - { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(weights); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + if (_impl->weights_manager != nullptr) { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed); - } - else - { - // Convert weights - _convert_weights.configure(weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->weights_manager->manage(_impl->original_weights); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(input_to_use, weights_to_use, biases, output, fc_info.activation_info)); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); +} - return Status{}; +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + prepare(); } - // Run matrix multiply - if(_is_quantized_asymmetric) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](Tensor * w) - { - if(!w->is_used()) - { - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - 
const ITensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - } - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) - { - _mm_gemm.prepare(); + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 68dc159f75..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,31 +29,53 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" + namespace arm_compute { -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; + +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + + _fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>(); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() { - NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY); + NEScheduler::get().schedule(_fuse_bn_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 2bd459a389..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,354 +23,140 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "arm_compute/runtime/TensorAllocator.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" -#include <cmath> +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemm.h" -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; namespace arm_compute { -NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), - _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) -{ -} - -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +struct NEGEMM::Impl { - ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); - - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); - bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _is_prepared = false; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _original_b = b; - _run_alpha_scale = alpha != 1.f; - _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run(); - _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run(); - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - - if(run_optimised) - { - const ITensor *c_to_use = is_c_bias ? 
c : nullptr; - if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE) - { - GEMMInfo gemm_info_ntb = gemm_info; - gemm_info_ntb.set_pretranpose_B(false); - _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb); - } - else - { - _asm_glue.configure(a, b, c_to_use, d, gemm_info); - } - ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured()); - - // Scale product by alpha - if(_run_alpha_scale) - { - _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); - } - } - else - { - // Pick output tensor in case bias addition should be performed - ITensor *gemm_output_to_use = d; - if(_run_bias_addition) - { - gemm_output_to_use = &_tmp_d; - _memory_group.manage(&_tmp_d); - } - - // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) - { - // Configure the matrix multiply kernel - _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false); - } - else - { - TensorShape shape_tmp_a = a->info()->tensor_shape(); - TensorShape shape_tmp_b = b->info()->tensor_shape(); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); + std::unique_ptr<cpu::CpuGemm> op{nullptr}; - const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type()); - shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w))); + const ITensor *original_b{nullptr}; + bool is_prepared{false}; - TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true); - TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true); + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; +}; - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - int m = a->info()->dimension(1); - int n = b->info()->dimension(0); - int k = a->info()->dimension(0); - - // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); - - // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); - - // Configure matrix multiplication kernel - _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); - - // Allocate once the all configure methods have been called - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_run_bias_addition) - { - _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _tmp_d.allocator()->allocate(); - } - } - - // Configure matrix addition kernel - if(_run_addition) - { - _ma_kernel.configure(c, d, beta); - } - - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(_run_activation) - { - _activation_func.configure(d, nullptr, activation); - } +NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float 
beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); +NEGEMM::~NEGEMM() = default; - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - } +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); - if(c != nullptr && !is_c_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); - } + // Check if we need to reshape the matrix B only on the first run + _impl->is_prepared = false; + _impl->original_b = b; + _impl->op = std::make_unique<cpu::CpuGemm>(); - if(output->total_size() != 0) + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) - { - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } + b_info_to_use->set_are_values_constant(false); } - // Check if we need to run the optimized assembly kernel - const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info)); - - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - // Check if the first input tensor is a vector. 
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run()); - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo - // in order to know how the matrices have been reshaped - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); - - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info = *output->clone(); - - if(run_interleave_transpose) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info)); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); - // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - - if(c != nullptr && gemm_info.reshape_b_only_on_first_run()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE)); - } - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); +} - // Validate matrix addition kernel - if(beta != 0 && c != nullptr && !is_c_bias) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + // Make the B matrix dynamic values. 
+ auto b_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta)); + b_to_use->set_are_values_constant(false); } - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } + return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); +} - return Status{}; +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha, beta); + return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); } void NEGEMM::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - if(_run_alpha_scale) - { - _alpha_scale_func.run(); - } - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - } - } - - NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY); - - // Run bias addition kernel - if(_run_bias_addition) - { - NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY); - } - } - - // Run matrix addition kernel - if(_run_addition) - { - NEScheduler::get().schedule(&_ma_kernel, Window::DimY); - } - - // Run activation function - if(_run_activation) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMM::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - if(_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - _asm_glue.prepare(); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->original_b->mark_as_unused(); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured()) + else { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp deleted file mode 100644 
index 24bd7d7a8c..0000000000 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace -{ -arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) -{ - arm_gemm::Activation gemm_act; - - // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) - { - return gemm_act; - } - - switch(act.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - gemm_act.type = arm_gemm::Activation::Type::ReLU; - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = 0.f; - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = act.b(); - break; - default: - gemm_act.type = arm_gemm::Activation::Type::None; - } - - return gemm_act; -} - -template <typename TypeInput, typename TypeOutput> -class FallbackTransform : public ITransformWeights -{ -public: - FallbackTransform() noexcept {}; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform(const FallbackTransform &) = delete; - /** Default move constructor */ - FallbackTransform(FallbackTransform &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform &operator=(const FallbackTransform &) = delete; - /** Default move assignment operator */ - FallbackTransform &operator=(FallbackTransform &&) = default; - void run() override - { - _output.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - uint32_t id = 
(_B_pretranspose_size | 0x80000000); - return id; - } - - void configure(size_t B_pretranspose_size, unsigned int alignment) - { - _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment); - _B_pretranspose_size = B_pretranspose_size; - } - - void set_pretranspose(ITensor *tensor) - { - if(!_reshape_run) - { - _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer()); - } - } - - void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm) - { - _ldb = ldb; - _in1_ptr = in1_ptr; - _multi_stride_b = multi_stride_b; - _gemm_kernel_asm = gemm_kernel_asm; - } - -private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; -}; - -/** Fallback in case ACL doesn't have a function */ -template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing> -class Fallback : public NEGEMMAssemblyDispatch::IFallback -{ -public: - /** Destructor */ - ~Fallback() - { - // Release memory if we have allocated the memory ourselves - if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b))) - { - delete _pretranspose; - } - } - - /** Initialise the functions's input and output. - * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. - * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. - * @param[in] gemm_info GEMM meta-data - * @param[in] memory_group Memory group to be used by the function. - * @param[in] weights_manager Weights manager to be used by the function. - * @param[in] os Output stage meta-data. - */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); - - /** Set requantization shifts to be used - * - * @param[in] shifts Requantization shifts - * - * @return Pointer to the shift data - */ - /** Set requantization data to be used - * - * - * @param shifts Requantization shifts - * @param multipliers Requantization multipliers - * - * @return A tuple with the pointers to the shift and multiplier data respectively - */ - std::tuple<const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers); - - // Inherited methods overridden: - void run() override; - void prepare() override; - bool is_configured() const override; - -private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. 
- */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); - - /** Assembly Gemm kernel */ - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; - /** Optimised NEON kernel */ - std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; - /** Input A */ - const ITensor *_a - { - nullptr - }; - /** Input B */ - const ITensor *_b - { - nullptr - }; - const ITensor *_c - { - nullptr - }; - /** Output */ - ITensor *_d{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - ITensor *_pretranspose{ nullptr }; - /** Prepared flag */ - bool _is_prepared{ false }; - /** GEMM meta-data */ - GEMMInfo _gemm_info{}; - /** Weights manager */ - IWeightsManager *_weights_manager{ nullptr }; - /** Weights transform object */ - FallbackTransform<TypeInput, TypeOutput> _weights_transform{}; - /** GEMM kernel description */ - arm_gemm::KernelDescription _kernel_info{}; - /** Per channel quantization shifts */ - std::vector<int32_t> _shifts{}; - /** Per channel quantization multipliers */ - std::vector<int32_t> _multipliers{}; -}; - -template <typename TypeInput, typename TypeOutput, class OutputStage> -std::tuple<const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers) -{ - _multipliers = multipliers; - _shifts = shifts; - std::transform(_shifts.begin(), _shifts.end(), _shifts.begin(), std::negate<int32_t>()); - return std::make_tuple(_shifts.data(), _multipliers.data()); -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) -{ - arm_gemm::GemmConfig gemm_cfg; - _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os); - _weights_manager = weights_manager; - if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED) - { - gemm_cfg.filter = _kernel_info.name; - args._cfg = &gemm_cfg; - } - _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); - if(_gemm_kernel_asm == nullptr) - { - //configuration not supported: Leave function unconfigured: - return; - } - - // arm_compute wrapper for the Gemm object (see above) - std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>(); - ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); - acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } - - //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and - //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 - { - const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); - if(window_size < static_cast<unsigned int>(args._maxthreads)) - { - _gemm_kernel_asm->set_nthreads(window_size); - } - } - - _optimised_kernel = std::move(acl_gemm_wrapper); - _a = a; - _b = b; - _c = c; - _d = d; - _gemm_info = gemm_info; - // Check for 
pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- if(weights_manager && _weights_manager->are_weights_managed(b))
- {
- _weights_transform.configure(B_pretranspose_size, alignment);
- _pretranspose = _weights_manager->acquire(b, &_weights_transform);
- }
- else
- {
- _pretranspose = new Tensor();
- static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- }
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
-{
- if(!_is_prepared)
- {
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(_c && _c->info()->data_type() == DataType::S32)
- {
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
- }
-
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-
- if(_weights_manager && _weights_manager->are_weights_managed(_b))
- {
- _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
- _weights_manager->run(_b, &_weights_transform);
-
- // If we didn't run the reshape function, set the pretransposed buffer
- if(!_weights_transform.is_reshape_run())
- {
- _weights_transform.set_pretranspose(_pretranspose);
- }
- }
- else
- {
- static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
- }
- }
-
- _is_prepared = true;
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
-{
- ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- memory_group.manage(&_workspace);
- _workspace.allocator()->allocate();
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
-{
- return _optimised_kernel != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run()
-{
- const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- int ldb = 0;
- const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
-
- const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2;
- const size_t a_multi_idx = a_batch_idx + 1;
- const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2;
- const size_t d_multi_idx = d_batch_idx + 1;
-
- const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
- const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
-
- const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
- int multi_stride_b = 0;
- const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
-
- const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
- const TypeInput *in1_ptr = nullptr;
- auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
-
- // Check if B is pre-transposed and de-reference if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
- {
- ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
- }
-
- // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
- if(_workspace.buffer() != nullptr)
- {
- _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
- const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
- unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
- {
- num_threads = window_size;
- _gemm_kernel_asm->set_nthreads(num_threads);
- }
- }
-
- // Prepare assembly kernel
- prepare();
-
- TypeOutput *bias = nullptr;
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(_c && _c->info()->data_type() != DataType::S32)
- {
- bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
- }
- // Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
- // Schedule assembly kernel
- IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
- {
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
-
- }
- else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32)
- {
- //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
-
- NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
- IWeightsManager *weights_manager)
-{
- INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M,
p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B()); - - // Create arm_gemm fallback - auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>(); - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager); - arm_gemm = std::move(fallback); -} - -template <typename TypeInput, typename TypeOutput> -void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, - IWeightsManager *weights_manager) -{ - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); - - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B()); - - // Create arm_gemm fallback - auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); - - // Configure requantization info - const int32_t a_offset = -a->info()->quantization_info().uniform().offset; - const int32_t b_offset = -b->info()->quantization_info().uniform().offset; - const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage(); - - arm_gemm::Requantize32 gemm_requant_info{}; - if(os_info.gemmlowp_shifts.size() > 1) - { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - std::get<0>(requantize_data), std::get<1>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - else - { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - - // Configure fallback - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info); - arm_gemm = std::move(fallback); -} - -} //namespace - -NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) -{ -} - -Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(gemm_info, c); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, 
DataType::QASYMM8_SIGNED, DataType::S8); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input"); - return Status{}; -} - -bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) -{ - arm_gemm::Activation act = map_to_arm_gemm_activation(activation); - return act.type != arm_gemm::Activation::Type::None; -} - -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info()); - - //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info)) - { - return; - } - - switch(a->info()->data_type()) - { - case DataType::F32: - create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(d->info()->data_type() == DataType::S32) - { - create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - else - { - create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - break; - case DataType::S8: - case DataType::QASYMM8_SIGNED: - if(d->info()->data_type() == DataType::S32) - { - create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - else - { - create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - break; -#endif /* __aarch64__ */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - break; - } -} - -void NEGEMMAssemblyDispatch::prepare() -{ - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->prepare(); -} - -bool NEGEMMAssemblyDispatch::is_configured() const -{ - return _arm_gemm != nullptr 
&& _arm_gemm->is_configured(); -} - -void NEGEMMAssemblyDispatch::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->run(); -} -} //namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp new file mode 100644 index 0000000000..6cca02eea9 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" + +namespace arm_compute +{ +using OperatorType = cpu::CpuGemmDirectConv2d; +using namespace arm_compute::experimental; + +struct NEGEMMConv2d::Impl +{ + const ITensor *weights{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + MemoryGroup memory_group{}; + bool is_prepared{false}; + experimental::MemoryRequirements aux_mem_req{}; +}; + +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(memory_manager); +} + +NEGEMMConv2d::~NEGEMMConv2d() = default; + +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + _impl->weights = weights; + _impl->is_prepared = false; + _impl->op = std::make_unique<OperatorType>(); + + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); + + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); +} + +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) +{ + return OperatorType::validate(input, weights, biases, output, info); +} + +void NEGEMMConv2d::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); +} + +void NEGEMMConv2d::prepare() +{ + if (!_impl->is_prepared) + { + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->weights->mark_as_unused(); + } + else + { + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); + } + + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; + } +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index a41d23f8d7..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,571 +26,109 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" -#include <set> -#include <tuple> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmConv2d.h" -namespace arm_compute -{ -using namespace arm_compute::misc::shape_calculator; - -NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() -{ -} - -void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info())); - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ITensor *biases_to_use = (append_biases) ? 
biases : nullptr; - - _weights_reshape_kernel.configure(weights, biases_to_use, output); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, - DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - - NEWeightsReshapeKernel::validate(weights, biases, output); - } - - return Status{}; -} +using namespace arm_compute::experimental; -void NEConvolutionLayerReshapeWeights::run() -{ - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); -} - -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), - _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _is_prepared(false) +namespace arm_compute { -} - -void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth) +struct NEGEMMConvolutionLayer::Impl { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), - act_info, gemm_3d_depth, _skip_im2col)); - - // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); - - // Supported activations in GEMM - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = input->info()->quantization_info(); - const QuantizationInfo wqinfo = weights->info()->quantization_info(); - const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? 
iqinfo : output->info()->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->info()->data_type();
-
- input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
-
- // Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(iqinfo);
- weights->info()->set_quantization_info(wqinfo);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+ const ITensor *weights{nullptr};
+ std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- const DataType data_type = input->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo &iqinfo = input->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (output->total_size() == 0) ?
iqinfo : output->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); - - // Perform validation step on GEMMLowp - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info)); - } - else - { - // Perform validation step on Matrix multiply function - return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info); - } + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } - -Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default; + +void NEGEMMConvolutionLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - const DataType data_type = input_info->data_type(); - const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; - const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; - - // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); - const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col); + _impl->weights = weights; + _impl->op = std::make_unique<cpu::CpuGemmConv2d>(); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - const ITensor *gemm_input_to_use = input; - ITensor *gemm_output_to_use = output; - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - if(data_layout == DataLayout::NHWC) - { - _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!_skip_col2im) - { - _skip_im2col = false; - } - } - else - { - _skip_col2im = false; - } - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels); - - // _weights_reshaped will be auto configured in the kernel. - // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM - const ITensor *weights_to_use = weights; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(weights, nullptr); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed); - } - else - { - _reshape_weights.configure(weights, nullptr, &_weights_reshaped); - weights_to_use = &_weights_reshaped; - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // Calculate GEMM output shape - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. 
- TensorInfo info_gemm(shape_gemm, 1, output_data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - - // Configure GEMM - // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; - configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - // Configure col2im - _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h)); - } - else - { - // Configure reshape layer - _reshape_layer.configure(gemm_output_to_use, output); - } - } - - if(_is_quantized && !_skip_col2im) - { - _tmp_output.allocator()->allocate(); - } - - if(!_skip_col2im || _is_quantized) - { - _gemm_output.allocator()->allocate(); - } - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); - - const DataLayout data_layout = input->data_layout(); - const DataType data_type = input->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - - TensorInfo 
im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo tmp_info{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - - const bool append_bias = false; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_bf16 = data_type == DataType::BFLOAT16; - bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - bool skip_col2im = false; - if(data_layout == DataLayout::NHWC) - { - skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!skip_col2im) - { - skip_im2col = false; - } - } - - if(skip_col2im) - { - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col))) - { - skip_im2col = false; - skip_col2im = false; - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else if(is_bf16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); - - // Output tensor auto inizialization if not yet initialized - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type); - weights_reshaped_info.set_quantization_info(weights->quantization_info()); - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - // Create tensor info for im2col reshaped inputs - // For NEON the batch size is on the fourth dimension - // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482 - TensorShape shape_im2col = input->tensor_shape(); - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - shape_im2col.set(2, 1); - - im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); - im2col_reshaped_info.set_quantization_info(input->quantization_info()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? 
DataType::F32 : data_type; - if(!skip_col2im) - { - TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - info_gemm = TensorInfo(shape_gemm, 1, output_data_type); - } - else - { - info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type); - } - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col)); - - // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) - { - ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h))); - } - - return Status{}; + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - if(!_skip_im2col) - { - // Run input reshaping - unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - NEScheduler::get().schedule(&_im2col_kernel, y_dim); - } - - // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - NEScheduler::get().schedule(&_col2im_kernel, Window::DimY); - } - else - { - _reshape_layer.run(); - } - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - _weights_manager->run(_original_weights, &_reshape_weights_managed); - } - else - { - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); - } + _impl->op->prepare(_impl->run_pack); - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp deleted file mode 100644 index 5692beb3f7..0000000000 --- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" - -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>(); - k->configure(input, output); - _kernel = std::move(k); -} -} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp deleted file mode 100644 index 087df19e20..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
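
Editor's note: one mechanical change repeated across every file in this series is the removal of support/MemorySupport.h. With the library now building as C++14 or later, the arm_compute::support::cpp14::make_unique polyfill is redundant and std::make_unique is used directly. A minimal before/after, with MyKernel as a hypothetical placeholder type:

#include <memory>

struct MyKernel
{
    void configure(int window_size) { (void)window_size; } // stand-in body
};

int main()
{
    // Before: auto k = arm_compute::support::cpp14::make_unique<MyKernel>();
    auto k = std::make_unique<MyKernel>(); // after: the standard C++14 facility
    k->configure(0);
    return 0;
}
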
- */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b() -{ -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - bool run_optimised = false; - switch(a->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8: - case DataType::U8: - { - _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true)); - run_optimised = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } - if(!run_optimised) - { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - - // Configure matrix multiply kernel - { - auto k = 
arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); - } - - // Allocate tensors - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); - } -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - if(_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } -} diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 993907c29a..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,564 +23,109 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +using namespace arm_compute::experimental; namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; - -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), - _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), - _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), - _run_activation(false), _flip_signedness(false) +struct NEGEMMLowpMatrixMultiplyCore::Impl { + const ITensor *b{nullptr}; + std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<Tensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager 
*weights_manager) + : _impl(std::make_unique<Impl>()) +{ + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } +NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; -void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) +void NEGEMMLowpMatrixMultiplyCore::configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - const ITensor *matrix_a = a; - const ITensor *matrix_b = b; - GEMMInfo info = gemm_info; - - // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _original_b = b; - - const ITensor *a_to_use = a; - - // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); - - _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); - _memory_group.manage(&_signed_a); - _convert_to_signed_asymm.configure(a_to_use, &_signed_a); - a_to_use = &_signed_a; - _a_offset = _signed_a.info()->quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); - _memory_group.manage(&_signed_output); - _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a = &_signed_a; - } - - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); - TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); - } - -#ifdef __aarch64__ - switch(a->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: - { - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); - _fused_assembly_path = 
_asm_glue.is_configured(); - } - else - { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info); - } - _assembly_path = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } -#endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) - { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a); - - // Configure transpose kernel - _mtx_b_reshape_kernel.configure(b, &_tmp_b); - } - - if(!_fused_assembly_path) - { - // Build reduction info - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info); - } - - if(_fuse_output_stage) - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32); - } - - _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); - - if(_flip_signedness) - { - _convert_from_signed_asymm.configure(&_signed_output, output); - } - } - else - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel.configure(matrix_a, matrix_b, output); - } - // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); - } - - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation))); - if(_run_activation) - { - _activation_func.configure(output, nullptr, activation); - } - } - - // Allocate tensors - if(!_assembly_path && !_run_vector_matrix_multiplication) - { - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(!_fused_assembly_path) - { - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); - } - } - - if(_fuse_output_stage) - { - _mm_result_s32.allocator()->allocate(); - } - - if(_flip_signedness) - { - _signed_a.allocator()->allocate(); - _signed_output.allocator()->allocate(); - } + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) + { + b_info_to_use->set_are_values_constant(false); + } + + _impl->b = b; + _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>(); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool 
fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + // Make the B matrix dynamic values. + auto b_info_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + b_info_to_use->set_are_values_constant(false); } - // Convert QASYMM8->QASYMM8_SIGNED - TensorInfo signed_a{}; - TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); - a_to_use = &signed_a; - a_offset = signed_a.quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a_info = &signed_a; - } - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); - run_optimised_requantized = run_optimised; - - const UniformQuantizationInfo a_qinfo = a_to_use->quantization_info().uniform(); - const QuantizationInfo b_qinfo = b->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : b_qinfo.scale()) - { - const float fmultipler = a_qinfo.scale * s / output_qinfo.scale; - if(fmultipler > 1.f) - { - run_optimised_requantized = false; - break; - } - } - } - else - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); - } - - if(run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) - { - if(info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if(!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); - } - - if(fuse_output_stage) - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? 
&signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); - } - } - - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } - - return Status{}; + return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info); } void NEGEMMLowpMatrixMultiplyCore::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) - { - NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY); - } - - // Run GEMM - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY); - } - } - NEScheduler::get().schedule(&_mm_kernel, Window::DimY); - } - - if(!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - if(_fuse_output_stage) - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); - } - else - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); - } - } - - // Convert QASYMM8_SIGNED->QASYMM8 - if(_flip_signedness) - { - NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY); - } - - // Run fused activation unless already run in the fused assembly - if(_run_activation && !_fused_assembly_path) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - // Run assembly reshape - if(_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); - _asm_glue.prepare(); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } - // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, 
Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (has_reshape != std::end(_impl->aux_mem_req)) { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + _impl->b->mark_as_unused(); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 43ca7b3fbb..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,179 +24,55 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" -namespace arm_compute -{ -void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(input, bias, output, &info); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" - return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); -} - -void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, 
const ITensorInfo *output, int min, int max) -{ - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +namespace arm_compute { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +struct NEGEMMLowpOutputStage::Impl { - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + ITensorPack run_pack{}; + std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr}; +}; + +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); - _kernel = std::move(k); } +NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), info)); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>(); + _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); + + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::UNKNOWN, "NEGEMMLowpQuantizeDownScaleByFixedPoint cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); +} - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } +void NEGEMMLowpOutputStage::run() +{ + _impl->op->run(_impl->run_pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp deleted file mode 100644 index c7a50a8932..0000000000 --- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
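
Editor's note: the rewritten runtime functions in this series (NEGEMMLowpMatrixMultiplyCore and NEGEMMLowpOutputStage above, NEGEMMConvolutionLayer earlier) all share one shape: a hidden Impl struct owns a stateless cpu:: operator plus an ITensorPack that binds ACL_SRC/ACL_BIAS/ACL_DST role tags to concrete tensors, and run() simply forwards the pack. A hedged sketch of that wiring with simplified stand-in types (none of these are the real ACL classes):

#include <cassert>
#include <map>
#include <memory>

enum class Role { Src, Bias, Dst };        // stands in for TensorType::ACL_*
using TensorPack = std::map<Role, void *>; // stands in for ITensorPack

struct CpuOpStub // stands in for e.g. cpu::CpuGemmLowpOutputStage
{
    void run(const TensorPack &pack)
    {
        assert(pack.count(Role::Src) != 0 && pack.count(Role::Dst) != 0);
        // kernel dispatch would happen here: read Src (+ optional Bias), write Dst
    }
};

struct FunctionFacade // stands in for the NE* runtime function
{
    struct Impl
    {
        std::unique_ptr<CpuOpStub> op;
        TensorPack                 run_pack;
    };
    std::unique_ptr<Impl> impl = std::make_unique<Impl>();

    void configure(void *src, void *bias, void *dst)
    {
        impl->op       = std::make_unique<CpuOpStub>();
        impl->run_pack = {{Role::Src, src}, {Role::Bias, bias}, {Role::Dst, dst}};
    }
    void run() { impl->op->run(impl->run_pack); } // no per-call state, just forwarding
};

The payoff of the pattern is visible above: the function keeps no kernels or intermediate tensors of its own, so the header loses all private members except the Impl pointer.
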
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} -Status NEGEMMTranspose1xW::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return NEGEMMTranspose1xWKernel::validate(input, output); -} -} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index cad42a3417..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "arm_compute/core/NEON/kernels/NEGatherKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include <utility> @@ -32,7 +32,8 @@ namespace arm_compute { void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); + auto k = std::make_unique<NEGatherKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp deleted file mode 100644 index 399d19ddde..0000000000 --- a/src/runtime/NEON/functions/NEGaussian3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
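
Editor's note: for context on what the deleted wrapper did, the "transpose 1xW" packing rearranges B so GEMM reads it with unit stride; the packed shape [b_height * 16, ceil(b_width / 16.0f)] is quoted for 8-bit data in the deleted NEGEMMLowpAssemblyMatrixMultiplyCore above. The indexing below is a reconstruction under that shape, not library code:

#include <cstddef>
#include <cstdint>
#include <vector>

// Reconstruction (assumption): packed row j concatenates, for every source
// row i, the W-wide chunk of B's columns [j*W, j*W + W); columns past the
// matrix edge are zero padding. W = 16 matches the 8-bit case above.
std::vector<uint8_t> transpose_1xW(const std::vector<uint8_t> &b,
                                   size_t h, size_t w, size_t W = 16)
{
    const size_t blocks = (w + W - 1) / W; // ceil(w / W) packed rows
    std::vector<uint8_t> out(blocks * h * W, 0);
    for (size_t j = 0; j < blocks; ++j)
        for (size_t i = 0; i < h; ++i)
            for (size_t k = 0; k < W && j * W + k < w; ++k)
                out[j * (h * W) + i * W + k] = b[i * w + j * W + k];
    return out;
}
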
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" - -#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp deleted file mode 100644 index 3c7411e2de..0000000000 --- a/src/runtime/NEON/functions/NEGaussian5x5.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
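
Editor's note: the deleted NEGaussian3x3::configure above forwards only border_mode == BorderMode::UNDEFINED to the kernel and leaves the rest to the border handler. A hedged reconstruction of the three border semantics (illustrative only, not the ACL kernels):

#include <cstdint>

enum class BorderMode { UNDEFINED, CONSTANT, REPLICATE }; // mirrors the ACL enum used above

// Hedged reconstruction: how a stencil kernel would source an out-of-image
// pixel under each mode. UNDEFINED means the caller promises never to read
// such pixels (the kernel shrinks its valid region instead), which is why
// configure() only forwards a boolean for it.
int sample(const uint8_t *img, int w, int h, int x, int y,
           BorderMode mode, uint8_t constant_value)
{
    if (x >= 0 && x < w && y >= 0 && y < h)
        return img[y * w + x];
    switch (mode)
    {
        case BorderMode::CONSTANT:
            return constant_value;
        case BorderMode::REPLICATE:
        {
            const int cx = x < 0 ? 0 : (x >= w ? w - 1 : x); // clamp to nearest valid pixel
            const int cy = y < 0 ? 0 : (y >= h ? h - 1 : y);
            return img[cy * w + cx];
        }
        default: // UNDEFINED: result is unspecified by contract
            return -1;
    }
}
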
- */ -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler() -{ -} - -void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - // Init temporary buffer - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); - _tmp.allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Create and configure kernels for the two passes - _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); - - _tmp.allocator()->allocate(); - - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void NEGaussian5x5::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_kernel_hor, Window::DimY); - NEScheduler::get().schedule(&_kernel_vert, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp deleted file mode 100644 index d08bf1e282..0000000000 --- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
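
Editor's note: NEGaussian5x5 above runs a horizontal then a vertical pass through an S16 temporary because the 5x5 Gaussian kernel is separable into [1 4 6 4 1] applied along each axis; the same 1+4+6+4+1 = 16 tap sum reappears as the c*2 + c*8 + c*6 border constant in the pyramid code below. A hedged scalar sketch of the two-pass scheme, ignoring borders (the real NEON kernels handle them via the border handler):

#include <cstdint>
#include <vector>

// Sketch, not the NEON kernels: the horizontal pass keeps the un-normalised
// 16x sum in S16 (255 * 16 fits), the vertical pass divides the combined
// 256x sum once at the end. Border rows/columns are left untouched.
void gaussian5x5(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int w, int h)
{
    static const int taps[5] = {1, 4, 6, 4, 1};
    std::vector<int16_t> tmp(static_cast<size_t>(w) * h, 0); // S16 intermediate, as above
    for (int y = 0; y < h; ++y)                              // horizontal pass
        for (int x = 2; x + 2 < w; ++x)
        {
            int acc = 0;
            for (int k = -2; k <= 2; ++k)
                acc += taps[k + 2] * src[y * w + x + k];
            tmp[y * w + x] = static_cast<int16_t>(acc);
        }
    dst.assign(static_cast<size_t>(w) * h, 0);
    for (int y = 2; y + 2 < h; ++y)                          // vertical pass
        for (int x = 0; x < w; ++x)
        {
            int acc = 0;
            for (int k = -2; k <= 2; ++k)
                acc += taps[k + 2] * tmp[(y + k) * w + x];
            dst[y * w + x] = static_cast<uint8_t>(acc / 256);
        }
}
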
- */ -#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" -#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" - -#include <cstddef> - -using namespace arm_compute; - -NEGaussianPyramid::NEGaussianPyramid() - : _input(nullptr), _pyramid(nullptr), _tmp() -{ -} - -NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT - : _horizontal_border_handler(), - _vertical_border_handler(), - _horizontal_reduction(), - _vertical_reduction() -{ -} - -void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); - - // Constant value to use for vertical fill border when the border mode is CONSTANT - const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6; - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - const size_t num_stages = num_levels - 1; - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - // Apply half scale to the X dimension of the tensor shape - TensorShape tensor_shape = pyramid->info()->tensor_shape(); - tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16); - _tmp.init(pyramid_info); - - _horizontal_reduction.clear(); - _vertical_reduction.clear(); - _horizontal_border_handler.clear(); - _vertical_border_handler.clear(); - - _horizontal_reduction.resize(num_stages); - _vertical_reduction.resize(num_stages); - _horizontal_border_handler.resize(num_stages); - _vertical_border_handler.resize(num_stages); - - for(size_t i = 0; i < num_stages; ++i) - { - /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); - - /* Configure vertical kernel */ - _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); - - /* Configure border */ - _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); - - /* Configure border */ - _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); - } - - _tmp.allocate(); - } -} - -void NEGaussianPyramidHalf::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get 
number of pyramid levels */ - const unsigned int num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - NEScheduler::get().schedule(&_horizontal_border_handler[i], Window::DimZ); - NEScheduler::get().schedule(&_horizontal_reduction[i], Window::DimY); - NEScheduler::get().schedule(&_vertical_border_handler[i], Window::DimZ); - NEScheduler::get().schedule(&_vertical_reduction[i], Window::DimY); - } -} - -NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT - : _gaus5x5(), - _scale_nearest() -{ -} - -void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - const size_t num_stages = num_levels - 1; - - _input = input; - _pyramid = pyramid; - - _gaus5x5.clear(); - _scale_nearest.clear(); - - _gaus5x5.resize(num_stages); - _scale_nearest.resize(num_stages); - - if(num_levels > 1) - { - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure gaussian 5x5 */ - _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); - - /* Configure scale */ - _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED); - } - - _tmp.allocate(); - } -} - -void NEGaussianPyramidOrb::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - _gaus5x5[i].run(); - _scale_nearest[i].run(); - } -} diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 82880bac85..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,17 +26,23 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" + namespace arm_compute { NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), - _flatten_deltas_kernel(), - _permute_scores_kernel(), - _flatten_scores_kernel(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), + _permute_deltas(), + _flatten_deltas(), + _permute_scores(), + _flatten_scores(), + _compute_anchors(nullptr), + _bounding_box(), + _pad(), _dequantize_anchors(), _dequantize_deltas(), _quantize_all_proposals(), @@ -61,46 +67,63 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage { } -void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, +NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default; + +void NEGenerateProposalsLayer::configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int 
post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>(); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); + _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } else { - _flatten_deltas_kernel.configure(deltas, &_deltas_flattened); + _flatten_deltas.configure(deltas, &_deltas_flattened); } const TensorShape flatten_shape_scores(1, total_num_anchors); @@ -108,21 +131,21 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); + _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } else { - _flatten_scores_kernel.configure(scores, &_scores_flattened); + _flatten_scores.configure(scores, &_scores_flattened); } Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -140,16 +163,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); _all_proposals_to_use = 
&_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -165,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -178,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -196,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -209,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -220,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info)); - - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, 
&deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); + + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -302,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != 
size_t(total_num_anchors));
@@ -318,36 +370,36 @@ void NEGenerateProposalsLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Compute all the anchors
-    NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY);
+    NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
 
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
-        NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY);
-        NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
+        _permute_deltas.run();
+        _permute_scores.run();
     }
-    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY);
-        NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY);
+        _dequantize_anchors.run();
+        _dequantize_deltas.run();
     }
 
     // Build the boxes
-    NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY);
+    _bounding_box.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY);
+        _quantize_all_proposals.run();
     }
 
     // Non maxima suppression
     _cpp_nms.run();
 
     // Add dummy batch indexes
-    NEScheduler::get().schedule(&_pad_kernel, Window::DimY);
+    _pad.run();
 }
 } // namespace arm_compute
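Note: the NEGenerateProposalsLayer diff above shows the pattern this changeset applies across the runtime: members that used to be raw kernels scheduled through NEScheduler become runtime functions that are simply run(), and a kernel that must remain a kernel is heap-allocated behind a std::unique_ptr, with the destructor defaulted out of line so the header can hold the kernel as an incomplete type. A minimal sketch of that ownership pattern with stand-in types (not the real Compute Library classes):

    #include <memory>

    struct IKernelStub                    // stand-in for an ICPPKernel-like interface
    {
        virtual ~IKernelStub() = default;
        virtual void execute()  = 0;
    };
    struct ComputeAnchorsKernelStub : IKernelStub
    {
        void execute() override {}        // real kernel work would go here
    };

    class GenerateProposalsStub
    {
    public:
        GenerateProposalsStub() : _compute_anchors(nullptr) {}
        ~GenerateProposalsStub() = default; // defaulted out of line in the real code, so the
                                            // header only needs a forward declaration of the kernel
        void configure()
        {
            _compute_anchors = std::make_unique<ComputeAnchorsKernelStub>();
        }
        void run()
        {
            _compute_anchors->execute();    // dispatched via NEScheduler in the real code
        }
    private:
        std::unique_ptr<IKernelStub> _compute_anchors;
    };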
- */ -#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/HOGInfo.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() -{ -} - -void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == output); - ARM_COMPUTE_ERROR_ON(nullptr == hog); - - const HOGInfo *hog_info = hog->info(); - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const size_t num_bins = hog_info->num_bins(); - - Size2D cell_size = hog_info->cell_size(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell_size.width; - const size_t num_cells_y = height / cell_size.height; - - // TensorShape of the input image - const TensorShape &shape_img = input->info()->tensor_shape(); - - // TensorShape of the hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate memory for magnitude, phase and hog space - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space.allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space); - - // Initialise orientation binning kernel - _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); - - // Initialize HOG norm kernel - _block_norm.configure(&_hog_space, output, hog->info()); - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - _hog_space.allocator()->allocate(); -} - -void NEHOGDescriptor::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run gradient - _gradient.run(); - - // Run orientation binning kernel - NEScheduler::get().schedule(&_orient_bin, Window::DimY); - - // Run block normalization kernel - NEScheduler::get().schedule(&_block_norm, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp deleted file mode 100644 index 95d7aae931..0000000000 --- a/src/runtime/NEON/functions/NEHOGDetector.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
deleted file mode 100644
index 95d7aae931..0000000000
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEHOGDetectorKernel>();
-    k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-    _kernel = std::move(k);
-}
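Note: the detector deleted above wraps a single kernel that scores sliding windows of the HOG descriptor space against a trained model and emits detections past a threshold. A simplified stand-in for that per-window scoring, assuming a linear model stored as weights plus a trailing bias; names and layout are illustrative only:

    #include <cstddef>
    #include <vector>

    struct Detection { int window_index; float score; };

    // Score each window's flattened descriptor against a linear model
    // (weights followed by one bias term) and keep windows above threshold.
    static void detect_windows(const std::vector<float> &descriptors, // window_count * descriptor_size values
                               std::size_t window_count, std::size_t descriptor_size,
                               const std::vector<float> &model,        // descriptor_size weights + bias
                               float threshold, std::vector<Detection> &out)
    {
        for (std::size_t w = 0; w < window_count; ++w)
        {
            float score = model[descriptor_size]; // bias
            for (std::size_t i = 0; i < descriptor_size; ++i)
                score += model[i] * descriptors[w * descriptor_size + i];
            if (score > threshold)
                out.push_back({static_cast<int>(w), score});
        }
    }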
- */ -#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _derivative(), - _mag_phase(nullptr), - _gx(), - _gy() -{ -} - -void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); - - const TensorShape &shape_img = input->info()->tensor_shape(); - - // Allocate image memory - TensorInfo info(shape_img, Format::S16); - _gx.allocator()->init(info); - _gy.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Initialise derivate kernel - _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); - - // Initialise magnitude/phase kernel - if(PhaseType::UNSIGNED == phase_type) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>(); - k->configure(&_gx, &_gy, output_magnitude, output_phase); - _mag_phase = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(&_gx, &_gy, output_magnitude, output_phase); - _mag_phase = std::move(k); - } - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); -} - -void NEHOGGradient::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run derivative - _derivative.run(); - - // Run magnitude/phase kernel - NEScheduler::get().schedule(_mag_phase.get(), Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp deleted file mode 100644 index 572e427b6e..0000000000 --- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
deleted file mode 100644
index 572e427b6e..0000000000
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-
-using namespace arm_compute;
-
-NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _gradient_kernel(),
-      _orient_bin_kernel(),
-      _block_norm_kernel(),
-      _hog_detect_kernel(),
-      _non_maxima_kernel(),
-      _hog_space(),
-      _hog_norm_space(),
-      _detection_windows(),
-      _mag(),
-      _phase(),
-      _non_maxima_suppression(false),
-      _num_orient_bin_kernel(0),
-      _num_block_norm_kernel(0),
-      _num_hog_detect_kernel(0)
-{
-}
-
-void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
-    ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
-    ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
-
-    const size_t       width      = input->info()->dimension(Window::DimX);
-    const size_t       height     = input->info()->dimension(Window::DimY);
-    const TensorShape &shape_img  = input->info()->tensor_shape();
-    const size_t       num_models = multi_hog->num_models();
-    PhaseType          phase_type = multi_hog->model(0)->info()->phase_type();
-
-    size_t prev_num_bins     = multi_hog->model(0)->info()->num_bins();
-    Size2D prev_cell_size    = multi_hog->model(0)->info()->cell_size();
-    Size2D prev_block_size   = multi_hog->model(0)->info()->block_size();
-    Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
-
-    /* Check if NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
-     *
-     * 1) NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
-     *    Since "multi_hog" is sorted, it is enough to check the HOG descriptors at level "ith" and level "(i-1)th"
-     * 2) NEHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change.
- * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * - * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel - * with "input_orient_bin", "input_hog_detect" and "input_block_norm" - */ - std::vector<size_t> input_orient_bin; - std::vector<size_t> input_hog_detect; - std::vector<std::pair<size_t, size_t>> input_block_norm; - - input_orient_bin.push_back(0); - input_hog_detect.push_back(0); - input_block_norm.emplace_back(0, 0); - - for(size_t i = 1; i < num_models; ++i) - { - size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); - Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); - Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); - Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); - - if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) - { - prev_num_bins = cur_num_bins; - prev_cell_size = cur_cell_size; - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute orientation binning and block normalization kernels. Update input to process - input_orient_bin.push_back(i); - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) - || (cur_block_stride.height != prev_block_stride.height)) - { - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute block normalization kernel. Update input to process - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - - // Update input to process for hog detector kernel - input_hog_detect.push_back(input_block_norm.size() - 1); - } - - _detection_windows = detection_windows; - _non_maxima_suppression = non_maxima_suppression; - _num_orient_bin_kernel = input_orient_bin.size(); // Number of NEHOGOrientationBinningKernel kernels to compute - _num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute - _num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute - - _orient_bin_kernel.clear(); - _block_norm_kernel.clear(); - _hog_detect_kernel.clear(); - _hog_space.clear(); - _hog_norm_space.clear(); - - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); - _hog_detect_kernel.resize(_num_hog_detect_kernel); - _hog_space.resize(_num_orient_bin_kernel); - _hog_norm_space.resize(_num_block_norm_kernel); - _non_maxima_kernel = CPPDetectionWindowNonMaximaSuppressionKernel(); - - // Allocate tensors for magnitude and phase - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); - - // Configure NETensor for the HOG space and orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - const size_t idx_multi_hog = input_orient_bin[i]; - - // Get the corresponding cell size and number of bins - const Size2D &cell 
= multi_hog->model(idx_multi_hog)->info()->cell_size(); - const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell.width; - const size_t num_cells_y = height / cell.height; - - // TensorShape of hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate HOG space - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space[i].allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space[i]); - - // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - // Configure NETensor for the normalized HOG space and block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - const size_t idx_multi_hog = input_block_norm[i].first; - const size_t idx_orient_bin = input_block_norm[i].second; - - // Allocate normalized HOG space - TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); - _hog_norm_space[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_hog_norm_space[i]); - - // Initialize block normalization kernel - _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - - // Configure HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - const size_t idx_block_norm = input_hog_detect[i]; - - _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i); - } - - // Configure non maxima suppression kernel - _non_maxima_kernel.configure(_detection_windows, min_distance); - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - _hog_norm_space[i].allocator()->allocate(); - } -} - -void NEHOGMultiDetection::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reset detection window - _detection_windows->clear(); - - // Run gradient - _gradient_kernel.run(); - - // Run orientation binning kernel - for(auto &kernel : _orient_bin_kernel) - { - NEScheduler::get().schedule(&kernel, Window::DimY); - } - - // Run block normalization kernel - for(auto &kernel : _block_norm_kernel) - { - NEScheduler::get().schedule(&kernel, Window::DimY); - } - - // Run HOG detector kernel - for(auto &kernel : _hog_detect_kernel) - { - kernel.run(); - } - - // Run non-maxima suppression kernel if enabled - if(_non_maxima_suppression) - { - NEScheduler::get().schedule(&_non_maxima_kernel, Window::DimY); - } -} diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp deleted file mode 100644 index bf1e27114c..0000000000 --- a/src/runtime/NEON/functions/NEHarrisCorners.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <cmath> -#include <utility> - -using namespace arm_compute; - -NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _harris_score(), - _non_max_suppr(), - _candidates(), - _sort_euclidean(), - _border_gx(), - _border_gy(), - _gx(), - _gy(), - _score(), - _nonmax(), - _corners_list(), - _num_corner_candidates(0) -{ -} - -void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - - const TensorShape shape = input->info()->tensor_shape(); - TensorInfo tensor_info_gxgy; - - if(gradient_size < 7) - { - tensor_info_gxgy.init(shape, Format::S16); - } - else - { - tensor_info_gxgy.init(shape, Format::S32); - } - - _gx.allocator()->init(tensor_info_gxgy); - _gy.allocator()->init(tensor_info_gxgy); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - TensorInfo tensor_info_score(shape, Format::F32); - _score.allocator()->init(tensor_info_score); - _nonmax.allocator()->init(tensor_info_score); - - _corners_list.resize(shape.x() * shape.y()); - - // Set/init Sobel kernel accordingly with gradient_size - switch(gradient_size) - { - case 3: - { - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - 
}
-        case 5:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
-            k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 7:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
-            k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Gradient size not implemented");
-    }
-
-    // Normalization factor
-    const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_score);
-
-    // Set/init Harris Score kernel according to block_size
-    switch(block_size)
-    {
-        case 3:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        break;
-        case 5:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        break;
-        case 7:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        default:
-            break;
-    }
-
-    // Configure border filling before harris score
-    _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
-    _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
-
-    // Allocate once all the configure methods have been called
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Init non-maxima suppression function
-    _non_max_suppr.configure(&_score, &_nonmax, border_mode);
-
-    // Allocate once all the configure methods have been called
-    _score.allocator()->allocate();
-
-    // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
-
-    // Allocate once all the configure methods have been called
-    _nonmax.allocator()->allocate();
-
-    // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
-}
-
-void NEHarrisCorners::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Reset the number of corner candidates to 0
-    _num_corner_candidates = 0;
-
-    // Run Sobel kernel
-    _sobel->run();
-
-    // Fill border before harris score kernel
-    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
-    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
-
-    // Run harris score kernel
-    NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
-
-    // Run non-maxima suppression
-    _non_max_suppr.run();
-
-    // Run corner candidate kernel
-    NEScheduler::get().schedule(&_candidates, Window::DimY);
-
-    // Run sort & euclidean distance
-    NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-}
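Note: the pipeline deleted above runs Sobel (size 3/5/7), a Harris score kernel, non-maxima suppression, corner-candidate extraction, and a sort/Euclidean-distance pass. The score kernel's core is the standard Harris response over a block_size x block_size neighbourhood, R = det(M) - k * trace(M)^2, where M is the structure tensor accumulated from the Sobel gradients and k is the sensitivity parameter passed to configure(). A single-pixel sketch, assuming the structure-tensor sums are already accumulated:

    // Harris corner response from structure-tensor sums over a window:
    //   M = | sum(gx*gx)  sum(gx*gy) |
    //       | sum(gx*gy)  sum(gy*gy) |
    static float harris_response(float sum_gx2, float sum_gy2, float sum_gxgy,
                                 float sensitivity /* the 'k' parameter */)
    {
        const float det   = sum_gx2 * sum_gy2 - sum_gxgy * sum_gxgy;
        const float trace = sum_gx2 + sum_gy2;
        return det - sensitivity * trace * trace; // large positive -> corner
    }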
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
deleted file mode 100644
index 6a672ed915..0000000000
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IDistribution1D.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEHistogram::NEHistogram()
-    : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
-{
-}
-
-void NEHistogram::configure(const IImage *input, IDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Allocate space for the threads' local histograms
-    _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
-    _local_hist.resize(_local_hist_size);
-
-    // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
-}
-
-void NEHistogram::run()
-{
-    // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index d7cb7de627..78218cbdee 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,23 +24,40 @@ #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" + namespace arm_compute { +NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; + NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon) { - const DataLayout data_layout = input->info()->data_layout(); + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); + + const DataLayout data_layout = input->info()->data_layout(); + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; - if(!_is_nchw) + _normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>(); + + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -49,7 +66,7 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); - _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _normalization_kernel->configure(&_permuted_input, &_permuted_output, kernel_descriptor); _permuted_output.info()->set_data_layout(DataLayout::NCHW); _permute_output.configure(&_permuted_output, output != nullptr ? 
output : input, PermutationVector(2U, 0U, 1U)); @@ -58,13 +75,16 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } else { - _normalization_kernel.configure(input, output, gamma, beta, epsilon); + _normalization_kernel->configure(input, output, kernel_descriptor); } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -72,15 +92,15 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } - NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp deleted file mode 100644 index 845f3b0936..0000000000 --- a/src/runtime/NEON/functions/NEIntegralImage.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" - -#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEIntegralImage::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>(); - k->configure(input, output); - _kernel = std::move(k); - _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); -} diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index 88ffdbfd08..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,12 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + namespace arm_compute { namespace { constexpr int max_input_tensor_dim = 3; } // namespace +NEL2NormalizeLayer::~NEL2NormalizeLayer() = default; NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() @@ -40,13 +45,16 @@ NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_ma void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Manage intermediate buffers _memory_group.manage(&_sumsq); // Configure Kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + _normalize_kernel = std::make_unique<NEL2NormalizeLayerKernel>(); + _normalize_kernel->configure(input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensors _sumsq.allocator()->allocate(); @@ -62,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); @@ -78,6 +87,6 @@ void NEL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); + NEScheduler::get().schedule(_normalize_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index f9d445fe71..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,47 +24,138 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "src/common/utils/Log.h" + namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; +NELSTMLayer::~NELSTMLayer() = default; + NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + 
_fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams<ITensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, 
input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -73,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -106,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -128,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -151,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { 
_memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -173,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -191,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -218,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -227,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -271,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -294,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -325,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, 
lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -347,8 +470,8 @@ void NELSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); // Vector for holding the tensors to store in scratch buffer - std::vector<ITensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensor *> scratch_inputs; + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -362,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, 
cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -403,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -424,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -435,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -455,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo 
*input, std::vector<const ITensorInfo *> inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -489,98 +631,130 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; 
lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); - } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + } + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) + { + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = 
arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } // Validate copy kernel - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out)); - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(&cell_state_tmp, cell_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output)); // Validate scratch concatenation - std::vector<ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensorInfo *> inputs_vector_info_raw; + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -601,108 +775,111 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY); + _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_forget_gate_bias, Window::DimY); + _pixelwise_mul_forget_gate_coeff.run(); + _accum_forget_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_forget_gate, Window::DimY); + _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<half *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<float *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } - NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY); + _subtract_input_gate.run(); } else { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY); + _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_input_gate_bias, Window::DimY); + _pixelwise_mul_input_gate_coeff.run(); + _accum_input_gate_bias.run(); } - 
NEScheduler::get().schedule(&_activation_input_gate, Window::DimY); + _activation_input_gate.run(); } _fully_connected_cell_state.run(); - NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY); + _transpose_cell_state.run(); _gemm_cell_state1.run(); - NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY); - if(_is_layer_norm_lstm) + _accum_cell_state1.run(); + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_gate_bias, Window::DimY); + _pixelwise_mul_cell_gate_coeff.run(); + _accum_cell_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_cell_state, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY); - if(_perform_cell_clipping) + _activation_cell_state.run(); + _pixelwise_mul_cell_state1.run(); + _pixelwise_mul_cell_state2.run(); + _accum_cell_state2.run(); + + if (_perform_cell_clipping) { - NEScheduler::get().schedule(&_cell_clip, Window::DimY); + _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY); + _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_output_gate_bias, Window::DimY); + _pixelwise_mul_output_gate_coeff.run(); + _accum_output_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_output, Window::DimY); + _activation_output.run(); - NEScheduler::get().schedule(&_activation_output_state, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY); + _activation_output_state.run(); + _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { - NEScheduler::get().schedule(&_projection_clip, Window::DimY); + _projection_clip.run(); } } - NEScheduler::get().schedule(&_copy_cell_state, Window::DimY); - NEScheduler::get().schedule(&_copy_output, Window::DimY); + _copy_cell_state.run(); + _copy_output.run(); _concat_scratch_buffer.run(); } void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index cdfc035400..41f9c3d700 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,11 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include <cmath> #include <memory> @@ -41,34 +44,107 @@ const QuantizationInfo qsymm_3(8.f / 32768.f, 0); // qsymm16 with 3 integer bit const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace +NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + 
_output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + 
output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -76,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -93,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector<const ITensor *> input_vector{ input, output_state_in }; + std::vector<const ITensor *> input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, &_input, Window::DimX); // Bias concatenation - std::vector<const ITensor *> bias_vector{ 
input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -130,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -141,69 +227,91 @@ void NELSTMLayerQuantized::configure(const ITensor *input, quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(&_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, 
&_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -213,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -233,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo 
*recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -253,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + 
->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -297,7 
+445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector<const ITensorInfo *> weights_vector; @@ -307,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -333,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -344,78 +494,107 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - 
ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo 
input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -424,14 +603,14 @@ 
Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -490,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp deleted file mode 100644 index 6b37029f0e..0000000000 --- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
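The output-stage hunks above swap the dedicated NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint function for the generic NEGEMMLowpOutputStage, configured through a GEMMLowpOutputStageInfo descriptor. The requantization math is unchanged: the S32 GEMM accumulators are scaled back to the QSYMM16 intermediate, whose 3-integer-bit format (scale 8/32768) makes the output scale 2^-12, hence the 4096 factor. A sketch of the descriptor construction, mirroring the diff (Status handling elided for brevity):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    using namespace arm_compute;

    GEMMLowpOutputStageInfo lstm_output_stage(float input_scale, float weights_scale)
    {
        // multiplier = (input_scale * weights_scale) / output_scale,
        // with output_scale = 2^-12 for the qsymm16 intermediate.
        const float multiplier = 4096.f * input_scale * weights_scale;

        int32_t output_multiplier = 0;
        int32_t output_shift      = 0;
        // Returns a Status in the real code, wrapped in ARM_COMPUTE_RETURN_ON_ERROR.
        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);

        GEMMLowpOutputStageInfo info;
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = output_multiplier;
        info.gemmlowp_shift      = output_shift;
        info.output_data_type    = DataType::QSYMM16;
        return info;
    }
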
- */ -#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" -#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" -#include "arm_compute/runtime/Tensor.h" - -using namespace arm_compute; - -NELaplacianPyramid::NELaplacianPyramid() // NOLINT - : _num_levels(0), - _gaussian_pyr_function(), - _convf(), - _subf(), - _gauss_pyr(), - _conv_pyr(), - _depth_function() -{ -} - -void NELaplacianPyramid::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); - - // Compute Gaussian Pyramid - _gaussian_pyr_function.run(); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Apply Gaussian filter to gaussian pyramid image - _convf[i].run(); - } - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Compute laplacian image - _subf[i].run(); - } - - _depth_function.run(); -} - -void NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - _num_levels = pyramid->info()->num_levels(); - - // Create and initialize the gaussian pyramid and the convoluted pyramid - PyramidInfo pyramid_info; - pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); - - _gauss_pyr.init(pyramid_info); - _conv_pyr.init(pyramid_info); - - // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value); - - _convf.resize(_num_levels); - _subf.resize(_num_levels); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); - } - - _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); - - _gauss_pyr.allocate(); - _conv_pyr.allocate(); -} diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp deleted file mode 100644 index 9f7588edb0..0000000000 --- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
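NELaplacianPyramid, deleted above, implemented the classic decomposition: each level stores the difference between a gaussian-pyramid level and its blurred copy, and the coarsest blurred level is depth-converted into the residual output for the reconstruction pass. A toy 1-D illustration of the same idea, deliberately not the NEON implementation (3-tap blur, made-up sizes, single channel):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Toy version of the scheme in the deleted NELaplacianPyramid: keep
    // gaussian[i] - blur(gaussian[i]) at each level; the next gaussian level
    // is the blurred signal downsampled by 2.
    static std::vector<float> blur3(const std::vector<float> &in)
    {
        std::vector<float> out(in.size());
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            const float l = in[i > 0 ? i - 1 : i];
            const float r = in[i + 1 < in.size() ? i + 1 : i];
            out[i] = 0.25f * l + 0.5f * in[i] + 0.25f * r;
        }
        return out;
    }

    int main()
    {
        std::vector<float> g = {1, 3, 7, 5, 2, 8, 6, 4};
        for (int level = 0; level < 2; ++level)
        {
            const std::vector<float> smooth = blur3(g);
            std::printf("level %d laplacian:", level);
            for (std::size_t i = 0; i < g.size(); ++i)
            {
                std::printf(" %.2f", g[i] - smooth[i]); // detail kept at this level
            }
            std::printf("\n");

            std::vector<float> next; // downsample by 2 for the next level
            for (std::size_t i = 0; i < smooth.size(); i += 2)
            {
                next.push_back(smooth[i]);
            }
            g = next;
        }
        return 0;
    }
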
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include <cstddef> - -using namespace arm_compute; - -NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT - : _tmp_pyr(), - _addf(), - _scalef(), - _depthf() -{ -} - -void NELaplacianReconstruct::configure(const IPyramid *pyramid, ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - const size_t num_levels = pyramid->info()->num_levels(); - - // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) - PyramidInfo pyramid_info; - pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); - - _tmp_pyr.init(pyramid_info); - - // Allocate add and scale functions. Level 0 does not need to be scaled. 
- _addf.resize(num_levels); - _scalef.resize(num_levels - 1); - - const size_t last_level = num_levels - 1; - - _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); - - // Scale levels n-1 to 1, and add levels n-2 to 0 - for(size_t l = 0; l < last_level; ++l) - { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); - } - - // Convert level 0 from S16 to U8 - _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); - - _tmp_pyr.allocate(); -} - -void NELaplacianReconstruct::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function"); - - const size_t last_level = _tmp_pyr.info()->num_levels() - 1; - - _addf[last_level].run(); - - // Run l = [last_level - 1, 0] - for(size_t l = last_level; l-- > 0;) - { - _scalef[l].run(); - _addf[l].run(); - } - - _depthf.run(); -} diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp deleted file mode 100644 index d08202dd24..0000000000 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
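One idiom in the deleted NELaplacianReconstruct::run() is worth noting: for (size_t l = last_level; l-- > 0;) walks an unsigned index from last_level - 1 down to 0 without underflow, because the decrement happens inside the condition and the loop body never sees the wrapped value. A standalone illustration:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t last_level = 4;

        // Visits 3, 2, 1, 0 and stops before the unsigned index can wrap.
        for (std::size_t l = last_level; l-- > 0;)
        {
            std::printf("level %zu\n", l);
        }
        return 0;
    }
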
- */ -#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include <cmath> -#include <tuple> - -using namespace arm_compute; - -namespace -{ -void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm) -{ - ARM_COMPUTE_UNUSED(output); - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - bool has_bias = (biases != nullptr); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - const size_t mat_weights_cols = weights->dimension(3); - const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0); - const size_t mat_weights_num = weights->dimension(4); - - shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num); - - const size_t mat_input_cols = mat_weights_rows; - const size_t mat_input_rows = conv_w * conv_h; - - shape_im2col = input->tensor_shape(); - shape_im2col.set(0, mat_input_cols); - shape_im2col.set(1, mat_input_rows); - shape_im2col.set(2, 1); - - shape_gemm = shape_im2col; - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, mat_input_rows); -} -} // namespace - -NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) -{ -} - -Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric()); - - bool has_bias = (biases != nullptr); - - if(has_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2); - } - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm); - - TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type()); 
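The calculate_shapes() helper in the deleted NELocallyConnectedLayer encodes the im2col/GEMM geometry of a locally connected layer: unlike a convolution, every output position has its own filter, so the weights tensor carries conv_w * conv_h filter sets in dimension 4, and each im2col row holds kernel_w * kernel_h * channels values, plus one when a bias is fused. A standalone sketch of that arithmetic with made-up sizes (stride 1, no padding, so the scaled_dimensions() call reduces to in - k + 1):

    #include <cstdio>

    int main()
    {
        // Illustrative geometry, not from the deleted code.
        const unsigned in_w = 8, in_h = 8, channels = 3;
        const unsigned k_w = 3, k_h = 3;
        const bool     has_bias = true;

        const unsigned conv_w = in_w - k_w + 1; // 6
        const unsigned conv_h = in_h - k_h + 1; // 6

        // One im2col row per output position; one column per kernel element (+ bias).
        const unsigned mat_input_rows = conv_w * conv_h;                           // 36
        const unsigned mat_input_cols = k_w * k_h * channels + (has_bias ? 1 : 0); // 28

        // Locally connected: a distinct weight matrix per output position,
        // matching weights->dimension(4) == conv_w * conv_h in the validate().
        const unsigned weight_sets = conv_w * conv_h;

        std::printf("im2col: %u x %u, weight sets: %u\n", mat_input_rows, mat_input_cols, weight_sets);
        return 0;
    }
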
- TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type()); - TensorInfo gemm_output_info(shape_gemm, 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias)); - ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h))); - - return Status{}; -} - -void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info)); - - bool _has_bias = (biases != nullptr); - _is_prepared = false; - _original_weights = weights; - - const unsigned int kernel_width = weights->info()->dimension(0); - const unsigned int kernel_height = weights->info()->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height, - conv_info); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm); - - _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); - _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); - - // Manage intermediate buffers - _memory_group.manage(&_input_im2col_reshaped); - _memory_group.manage(&_gemm_output); - - // Configure kernels - _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); - _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h)); - - // Allocate intermediate tensors - _input_im2col_reshaped.allocator()->allocate(); - _gemm_output.allocator()->allocate(); -} - -void NELocallyConnectedLayer::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input reshaping - NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY); - - // Runs GEMM on reshaped matrices - NEScheduler::get().schedule(&_mm_kernel, Window::DimX); - - // Reshape output matrix - NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY); -} - -void NELocallyConnectedLayer::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); - _original_weights->mark_as_unused(); - - _is_prepared = true; - } -} diff --git 
a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp new file mode 100644 index 0000000000..0013a521d1 --- /dev/null +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NELogical.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NELogicalKernel.h" + +namespace arm_compute +{ +struct LogicalArgs +{ + std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr}; + ITensorPack pack{}; +}; + +struct NELogicalAnd::Impl : public LogicalArgs +{ +}; +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalAnd::~NELogicalAnd() = default; + +void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1); + _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2); + _impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::And); +} + +void NELogicalAnd::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} + +struct NELogicalOr::Impl : public LogicalArgs +{ +}; +NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalOr::~NELogicalOr() = default; + +void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1); + _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2); + 
_impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::Or); +} + +void NELogicalOr::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} + +struct NELogicalNot::Impl : public LogicalArgs +{ +}; +NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalNot::~NELogicalNot() = default; + +void NELogicalNot::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input); + _impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input, nullptr, output, LogicalOperation::Not); +} + +void NELogicalNot::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp deleted file mode 100644 index ff2cd49495..0000000000 --- a/src/runtime/NEON/functions/NEMagnitude.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
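A minimal usage sketch for the NELogicalAnd function added above; the tensor names, shapes and U8 data type are assumptions for illustration, not part of the diff:

    #include "arm_compute/runtime/NEON/functions/NELogical.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor a, b, out;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::U8); // non-zero == true
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NELogicalAnd logical_and;
    logical_and.configure(&a, &b, &out); // builds the NELogicalKernel and tensor pack
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    logical_and.run(); // schedules the kernel over Window::DimY with the stored pack

NELogicalOr and NELogicalNot follow the same shape, NELogicalNot taking a single input.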
- */ -#include "arm_compute/runtime/NEON/functions/NEMagnitude.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type) -{ - if(mag_type == MagnitudeType::L1NORM) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, output, nullptr); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, output, nullptr); - _kernel = std::move(k); - } -} diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp new file mode 100644 index 0000000000..31898bafc4 --- /dev/null +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuMatMul.h" + +namespace arm_compute +{ +struct NEMatMul::Impl +{ + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr<cpu::CpuMatMul> op{nullptr}; + MemoryGroup memory_group{}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; +}; + +NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +NEMatMul::~NEMatMul() = default; + +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + _impl->lhs = lhs; + _impl->rhs = rhs; + _impl->output = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); + _impl->op = std::make_unique<cpu::CpuMatMul>(); + _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); +} + +void NEMatMul::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp new file mode 100644 index 0000000000..c3861afd2c --- /dev/null +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" +#include "src/cpu/operators/CpuMaxUnpooling.h" + +namespace arm_compute +{ +struct NEMaxUnpoolingLayer::Impl +{ + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr}; +}; + +NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; + +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() +{ +} + +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); + + const PixelValue zero_value(0.f); + _fill_func = std::make_unique<NEFill>(); + _impl = std::make_unique<Impl>(); + _impl->src = input; + _impl->indices = indices; + _impl->dst = output; + + _impl->op = std::make_unique<cpu::CpuMaxUnpooling>(); + _fill_func->configure(output, zero_value); + _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); +} + +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); + return Status{}; +} + +void NEMaxUnpoolingLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->indices); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _fill_func->run(); + _impl->op->run(pack); +} +} /* namespace arm_compute */ diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp deleted file mode 100644 index 2304bc80d7..0000000000 --- a/src/runtime/NEON/functions/NEMeanStdDev.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEMeanStdDev::NEMeanStdDev() - : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0) -{ -} - -void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev) -{ - _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0))); -} - -void NEMeanStdDev::run() -{ - _global_sum = 0; - _global_sum_squared = 0; - - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ); - NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index fdf2980961..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,18 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { +NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default; + void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon) { - auto k = arm_compute::support::cpp14::make_unique<NEMeanStdDevNormalizationKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); + + auto k = std::make_unique<NEMeanStdDevNormalizationKernel>(); k->configure(input, output, epsilon); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp deleted file mode 100644 index e24023cf3a..0000000000 --- a/src/runtime/NEON/functions/NEMedian3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h" - -#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp deleted file mode 100644 index 54e89abe24..0000000000 --- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEMinMaxLocation::NEMinMaxLocation() - : _min_max(), _min_max_loc() -{ -} - -void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) -{ - _min_max.configure(input, min, max); - _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count); -} - -void NEMinMaxLocation::run() -{ - _min_max.reset(); - - /* Run min max kernel */ - NEScheduler::get().schedule(&_min_max, Window::DimY); - - /* Run min max location */ - NEScheduler::get().schedule(&_min_max_loc, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp deleted file mode 100644 index 6875d2e5d5..0000000000 --- a/src/runtime/NEON/functions/NENonLinearFilter.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h" - -#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>(); - k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp deleted file mode 100644 index 5c1c3d29ad..0000000000 --- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h" - -#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode) -{ - auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - if(border_mode != BorderMode::UNDEFINED) - { - _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f)); - } - else - { - _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f)); - } -} diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index d52e92828e..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,16 +30,22 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NENormalizationLayerKernel.h" + +namespace arm_compute +{ +NENormalizationLayer::~NENormalizationLayer() = default; NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared() + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared() { } void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type()); _input_squared.allocator()->init(tensor_info); @@ -48,21 +54,24 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _memory_group.manage(&_input_squared); // Configure kernels - _norm_kernel.configure(input, &_input_squared, output, norm_info); - _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f)); + _norm_kernel = std::make_unique<NENormalizationLayerKernel>(); + _norm_kernel->configure(input, &_input_squared, output, norm_info); + _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); // Allocate the tensor once the configure methods have been called _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 
1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -70,8 +79,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void NENormalizationLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); - NEScheduler::get().schedule(&_border_handler, Window::DimY); - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + _multiply_f.run(); + NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp deleted file mode 100644 index cb10ca8508..0000000000 --- a/src/runtime/NEON/functions/NEOpticalFlow.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _func_scharr(), - _kernel_tracker(), - _scharr_gx(), - _scharr_gy(), - _new_points(nullptr), - _new_points_estimates(nullptr), - _old_points(nullptr), - _new_points_internal(), - _old_points_internal(), - _num_levels(0) -{ -} - -void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, - IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension, - bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == old_points); - ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); - ARM_COMPUTE_ERROR_ON(nullptr == new_points); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); - - _num_levels = old_pyramid->info()->num_levels(); - _old_points = old_points; - _new_points = new_points; - _new_points_estimates = new_points_estimates; - - const float pyr_scale = old_pyramid->info()->scale(); - - _func_scharr.clear(); - _kernel_tracker.clear(); - _scharr_gx.clear(); - _scharr_gy.clear(); - - _func_scharr.resize(_num_levels); - _kernel_tracker.resize(_num_levels); - _scharr_gx.resize(_num_levels); - _scharr_gy.resize(_num_levels); - - _old_points_internal = LKInternalKeypointArray(old_points->num_values()); - _new_points_internal = LKInternalKeypointArray(old_points->num_values()); - _new_points->resize(old_points->num_values()); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Get images from the ith level of old and right pyramid - IImage *old_ith_input = old_pyramid->get_pyramid_level(i); - IImage *new_ith_input = new_pyramid->get_pyramid_level(i); - - // Get width and height of images - const unsigned int width_ith = old_ith_input->info()->dimension(0); - const unsigned int height_ith = new_ith_input->info()->dimension(1); - - TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16); - - _scharr_gx[i].allocator()->init(tensor_info); - _scharr_gy[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_scharr_gx[i]); - _memory_group.manage(&_scharr_gy[i]); - - // Init Scharr kernel - _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); - - // Init Lucas-Kanade kernel - 
_kernel_tracker[i].configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i], - old_points, new_points_estimates, new_points, - &_old_points_internal, &_new_points_internal, - termination, use_initial_estimate, epsilon, num_iterations, window_dimension, - i, _num_levels, pyr_scale); - - _scharr_gx[i].allocator()->allocate(); - _scharr_gy[i].allocator()->allocate(); - } -} - -void NEOpticalFlow::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int level = _num_levels; level > 0; --level) - { - // Run Scharr kernel - _func_scharr[level - 1].run(); - - // Run Lucas-Kanade kernel - NEScheduler::get().schedule(&_kernel_tracker[level - 1], Window::DimX); - } -} diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 02dfc6f137..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,20 +24,48 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuPRelu.h" namespace arm_compute { +using OperatorType = cpu::CpuPRelu; + +struct NEPReluLayer::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; +}; + +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; +NEPReluLayer::~NEPReluLayer() = default; + void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>(); - k->configure(ArithmeticOperation::PRELU, input, alpha, output); - _kernel = std::move(k); + _impl->src_0 = input; + _impl->src_1 = alpha; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(input->info(), alpha->info(), output->info()); +} + +void NEPReluLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } Status NEPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) { - return NEArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); + return OperatorType::validate(input, alpha, output); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 537eba7fdf..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
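The NEPReluLayer rewrite above is the pattern repeated throughout this change set: the public function shrinks to an Impl struct holding raw tensor pointers plus a stateless cpu:: operator, and run() late-binds memory through an ITensorPack. A condensed sketch of the pattern with placeholder names (Function and cpu::Op are not real types):

    struct Function::Impl
    {
        const ITensor           *src{nullptr};
        ITensor                 *dst{nullptr};
        std::unique_ptr<cpu::Op> op{nullptr}; // configured on ITensorInfo only
    };

    void Function::configure(const ITensor *input, ITensor *output)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<cpu::Op>();
        _impl->op->configure(input->info(), output->info()); // no payload memory touched
    }

    void Function::run()
    {
        ITensorPack pack; // tensor memory resolved at run time, not configure time
        pack.add_tensor(TensorType::ACL_SRC, _impl->src);
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }

Separating configuration (ITensorInfo) from execution (ITensorPack) lets the same cpu:: operator back both the NEON runtime functions and the stateless operator API.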
* * SPDX-License-Identifier: MIT * @@ -23,10 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -35,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -46,14 +49,28 @@ uint32_t last_padding_dimension(const PaddingList &padding) } } // namespace +NEPadLayer::~NEPadLayer() = default; + NEPadLayer::NEPadLayer() - : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { - _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT); + _pad_kernel = std::make_unique<NEPadLayerKernel>(); + _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); } void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output) @@ -79,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -117,12 +134,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i); // Reflect the input values for the padding before and after the input. 
- std::vector<ITensor *> concat_vector; - if(_padding[i].first > 0) + std::vector<const ITensor *> concat_vector; + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -132,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -147,8 +166,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i]; + out->info()->set_quantization_info(output->info()->quantization_info()); + for (auto &v : concat_vector) + { + v->info()->set_quantization_info(input->info()->quantization_info()); + } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -159,22 +183,28 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. 
_num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -194,23 +224,27 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(input, output); + _copy_function.configure(input, output); } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -219,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -244,27 +278,27 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { - NEScheduler::get().schedule(&_pad_kernel, Window::DimZ); + NEScheduler::get().schedule(_pad_kernel.get(), Window::DimZ); break; } case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } @@ -279,7 +313,7 @@ void NEPadLayer::run() } else { - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); + _copy_function.run(); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index cfd27da117..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,20 +23,48 @@ */ #include "arm_compute/runtime/NEON/functions/NEPermute.h" -#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuPermute.h" namespace arm_compute { +struct NEPermute::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPermute> op{nullptr}; +}; + +NEPermute::NEPermute() : _impl(std::make_unique<Impl>()) +{ +} + +NEPermute::~NEPermute() = default; + void NEPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<NEPermuteKernel>(); - k->configure(input, output, perm); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuPermute>(); + _impl->op->configure(input->info(), output->info(), perm); } Status NEPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { - return NEPermuteKernel::validate(input, output, perm); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuPermute::validate(input, output, perm)); + + return Status{}; +} + +void NEPermute::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp deleted file mode 100644 index bb96f6db5c..0000000000 --- a/src/runtime/NEON/functions/NEPhase.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
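A minimal sketch of the reworked NEPermute above; the shapes and the chosen permutation are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEPermute.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 8U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

    NEPermute permute;
    permute.configure(&src, &dst, PermutationVector(1U, 0U, 2U)); // swap dims 0 and 1
    src.allocator()->allocate();
    dst.allocator()->allocate();
    permute.run(); // forwards src/dst through an ITensorPack to cpu::CpuPermute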
- */ -#include "arm_compute/runtime/NEON/functions/NEPhase.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type) -{ - if(phase_type == PhaseType::UNSIGNED) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>(); - k->configure(input1, input2, nullptr, output); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, nullptr, output); - _kernel = std::move(k); - } -} diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index eaf233b9ed..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,60 +24,101 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuMul.h" #include <utility> namespace arm_compute { -void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) +struct NEPixelWiseMultiplication::Impl { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMul> op{nullptr}; +}; - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ } -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; + +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy); + return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuMul>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); +} - if(output->info()->dimension(0) > 1) - { - ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; +void NEPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } +struct NEComplexPixelWiseMultiplication::Impl +{ + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuComplexMul> op{nullptr}; +}; + +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; + +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuComplexMul>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), act_info); } +void NEComplexPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp new file mode 100644 index 0000000000..e017e8c21d --- /dev/null +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
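A usage sketch for the migrated NEPixelWiseMultiplication above (shapes and data type are illustrative). One behavioural change is visible in the diff: validate() previously rejected any enabled activation, whereas it now forwards act_info to cpu::CpuMul:

    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor a, b, out;
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NEPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.f /* scale */, ConvertPolicy::SATURATE,
                  RoundingPolicy::TO_ZERO); // act_info assumed to default in the header
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    mul.run();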
+ */ +#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuPool3d.h" + +namespace arm_compute +{ +struct NEPooling3dLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPool3d> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; + +NEPooling3dLayer::~NEPooling3dLayer() = default; + +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuPool3d>(); + _impl->op->configure(input->info(), output->info(), pool_info); + + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return cpu::CpuPool3d::validate(input, output, pool_info); +} + +void NEPooling3dLayer::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 12921cf40e..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
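Both ports above follow the same shape: the NEON function keeps only an opaque Impl holding the user tensors plus a cpu:: operator, and run() packs the tensors into an ITensorPack before delegating. A minimal caller-side sketch of the public API, assuming the Compute Library headers and the usual init/configure/allocate/run order; the shape, data type and policies below are arbitrary illustrative choices, not values from this patch:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        // Configure before allocating so the operator can request what it needs.
        NEPixelWiseMultiplication mul;
        mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        // run() builds the {ACL_SRC_0, ACL_SRC_1, ACL_DST} pack and calls cpu::CpuMul::run().
        mul.run();
        return 0;
    }

NEPooling3dLayer above uses the same delegation, with the difference that its run pack is prebuilt once at configure() time together with the operator's workspace tensors.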
* * SPDX-License-Identifier: MIT * @@ -23,69 +23,59 @@ */ #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Tensor.h" -using namespace arm_compute; +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuPool2d.h" -NEPoolingLayer::NEPoolingLayer() - : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW) +namespace arm_compute { +struct NEPoolingLayer::Impl +{ + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr<cpu::CpuPool2d> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; + +NEPoolingLayer::~NEPoolingLayer() = default; + +NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices) { - // Check if we have Global Pooling Layer - _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size.width) && (input->info()->dimension(1) == pool_info.pool_size.height); + _impl->src = input; + _impl->dst = output; + _impl->indices = indices; + _impl->op = std::make_unique<cpu::CpuPool2d>(); + _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout; - - // Configure pooling kernel - _pooling_layer_kernel.configure(input, output, pool_info, indices); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (pool_info.pool_type == PoolingType::MAX) ? 
BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); - } - _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - return NEPoolingLayerKernel::validate(input, output, pool_info, indices); + return cpu::CpuPool2d::validate(input, output, pool_info, indices); } void NEPoolingLayer::run() { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimY); - - // Run pooling layer - NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index 6e7d4ab667..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
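NEPoolingLayer::run() above leans on MemoryGroupResourceScope so workspace tensors are only resident for the duration of the call. A simplified, self-contained model of that RAII idiom (my own stand-in types, not the library's):

    #include <cstdio>

    // Stand-ins for MemoryGroup / MemoryGroupResourceScope, reduced to the idiom.
    struct GroupModel
    {
        void acquire() { std::puts("acquire workspace backing memory"); }
        void release() { std::puts("release workspace backing memory"); }
    };

    struct ScopeModel
    {
        explicit ScopeModel(GroupModel &g) : _g(g) { _g.acquire(); }
        ~ScopeModel() { _g.release(); }
        GroupModel &_g;
    };

    void run(GroupModel &group)
    {
        ScopeModel scope_mg(group); // mirrors: MemoryGroupResourceScope scope_mg(_impl->memory_group);
        std::puts("op->run(run_pack)");
    } // scope ends here: backing memory is handed back even on early exit

    int main()
    {
        GroupModel group;
        run(group);
        return 0;
    }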
* * SPDX-License-Identifier: MIT * @@ -27,20 +27,31 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" + namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<NEPriorBoxLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + + auto k = std::make_unique<NEPriorBoxLayerKernel>(); k->configure(input1, input2, output, info); _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 083e3fddb4..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,29 +23,64 @@ */ #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" + namespace arm_compute { using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, 
&gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace +Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage. + const TensorInfo out{in}; + return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + +void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + Tensor &out = get_layer_norm_output(g); + _memory_group.manage(&out); + out.allocator()->init(*(in->info())); + + get_layer_norm(g) = std::make_unique<NEQLSTMLayerNormalizationKernel>(); + get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +NEQLSTMLayer::TensorCopyKernel::~TensorCopyKernel() = default; + Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported); @@ -58,6 +93,8 @@ Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const IT void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) { ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info())); + ARM_COMPUTE_LOG_PARAMS(src, dst); + _src = &src; _dst = &dst; _row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x()); @@ -66,25 +103,108 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } +NEQLSTMLayer::~NEQLSTMLayer() = default; + NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(), + _dequantize_input_to_forget_weights(), + _quantize_input_to_forget_weights(), + _transpose_input_to_forget_weights(), + _transpose_input_to_cell_weights(), + _transpose_input_to_output_weights(), + _transpose_input_to_input_weights(), + _transpose_recurrent_to_forget_weights(), + _transpose_recurrent_to_cell_weights(), + _transpose_recurrent_to_output_weights(), + _transpose_recurrent_to_input_weights(), + _transpose_projection_weights(), + _input_to_input_reduction(), + _recurrent_to_input_reduction(), + _input_to_forget_reduction(), + _recurrent_to_forget_reduction(), + _input_to_cell_reduction(), + _recurrent_to_cell_reduction(), + _input_to_output_reduction(), + _recurrent_to_output_reduction(), + _projection_reduction(), + _projection_bias_add(), + _mm_input_to_forget(), + _mm_recurrent_to_forget(), + _pixelwise_mul_cell_to_forget(), + _input_to_forget_outstage(), + _recurrent_to_forget_outstage(), + _cell_to_forget_outstage(), + _accumulate_input_recurrent_forget(), + _accumulate_cell_forget(), + _forget_gate_sigmoid(), + _mm_input_to_cell(), + _input_to_cell_outstage(), + _mm_recurrent_to_cell(), + _recurrent_to_cell_outstage(), + 
_accumulate_input_recurrent_modulation(), + _cell_gate_tanh(), + _input_gate_sub(), + _mm_input_to_input(), + _input_to_input_outstage(), + _mm_recurrent_to_input(), + _recurrent_to_input_outstage(), + _accumulate_input_recurrent_input(), + _pixelwise_mul_cell_to_input(), + _cell_to_input_outstage(), + _accumulate_cell_input(), + _input_gate_sigmoid(), + _pixelwise_mul_forget_cell(), + _pixelwise_mul_input_cell(), + _add_forget_cell(), + _cell_clip(), + _mm_input_to_output(), + _input_to_output_outstage(), + _mm_recurrent_to_output(), + _recurrent_to_output_outstage(), + _accumulate_input_recurrent_output(), + _pixelwise_mul_cell_to_output(), + _cell_to_output_outstage(), + _accumulate_cell_to_output(), + _output_gate_sigmoid(), + _hidden_tanh(), + _pixelwise_mul_hidden(), + _hidden_outstage(), + _mm_projection(), + _projection_outstage(), + _accumulate_projection(), + _projection_clip(), + _projection_bias_copy(), + _projection_output_to_accumulate_copy(), + _projection_accumulate_to_output_copy(), + _hidden_to_output_copy(), + _layer_norms(), + _copy_output(), + _layer_norm_weights(), + _layer_norm_bias(), + _layer_norm_output() { _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -96,33 +216,88 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor 
*cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams<ITensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - // Validate - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); + _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + { + _convert_input_to_forget_weights_to_qsymm8 = true; + // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 + + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); + // Setup the quantize output tensor to go from F32 -> QSYMM8 + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + + _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); + _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); + + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), 
output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } + else + { + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -133,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -143,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -165,45 +342,79 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
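The reduction kernels configured below exploit the identity sum_j (x_j - z) * W[i][j] = sum_j x_j * W[i][j] + (-z) * sum_j W[i][j]: the second term is input-independent, so one matrix-A reduction with offset -z per weight matrix turns the zero-point correction into a precomputed "effective bias". A toy check with arbitrary values (not from this patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t z = 5;                               // input offset, cf. -qinput.offset below
        const int8_t  x[3]    = {12, -3, 7};               // quantized input row
        const int8_t  W[2][3] = {{2, -1, 4}, {0, 3, -2}};  // quantized weights, 2 units

        for (int i = 0; i < 2; ++i)
        {
            int32_t exact = 0, dot = 0, row_sum = 0;
            for (int j = 0; j < 3; ++j)
            {
                exact   += (x[j] - z) * W[i][j]; // what the matmul must produce
                dot     += x[j] * W[i][j];       // plain integer dot product
                row_sum += W[i][j];
            }
            const int32_t eff_bias = -z * row_sum; // what the reduction kernels precompute
            std::printf("unit %d: exact=%d, dot+eff_bias=%d\n", i, (int)exact, (int)(dot + eff_bias));
        }
        return 0;
    }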
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + + _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + 
_input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction.configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) + { + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); + } } // Pre-transpose weights to be used in GEMM. _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -216,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
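Every gate matmul below rescales from the raw product scale (weight scale times activation scale) to the gate's QSYMM16 intermediate scale, and quantization::calculate_quantized_multiplier() turns that float factor into a fixed-point multiplier plus shift for the output stage. A simplified re-derivation of that decomposition, with hypothetical scales (the library routine has extra handling this sketch omits):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const double w_scale      = 0.007;      // weights (illustrative)
        const double x_scale      = 1.0 / 255;  // activations (illustrative)
        const double intermediate = 1.0 / 4096; // gate intermediate scale (illustrative)

        const double s = w_scale * x_scale / intermediate; // cf. input_to_forget_scale below

        int shift = 0;
        const double q = std::frexp(s, &shift);    // s = q * 2^shift, q in [0.5, 1)
        int64_t m = std::llround(q * (1ll << 31)); // Q0.31 mantissa
        if (m == (1ll << 31)) { m /= 2; ++shift; } // rounding pushed q up to 1.0
        const int32_t multiplier = static_cast<int32_t>(m);

        std::printf("scale %.8f -> multiplier=%d, shift=%d\n", s, multiplier, shift);
        return 0;
    }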
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + 
QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -258,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
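The sigmoid_tanh_outqinfo used above pins all gate activations to QSYMM16 with scale 1/32768, i.e. Q0.15 fixed point, so a function whose range is (-1, 1) spans the full int16 range. A quick round-trip check:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float scale = 1.f / 32768.f;             // Q0.15, cf. sigmoid_tanh_outqinfo above
        const float y = 1.f / (1.f + std::exp(-0.7f)); // a sigmoid output in (0, 1)

        const long    r = std::lround(y / scale);
        const int16_t q = static_cast<int16_t>(std::min(32767L, std::max(-32768L, r)));

        std::printf("%f -> %d -> %f (error %g)\n", y, q, q * scale, std::fabs(y - q * scale));
        return 0;
    }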
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -295,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
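With CIFG enabled, the branch below derives the input gate as 1 - forget_gate by a saturating subtraction from a tensor of ones; in the Q0.15 gate format, "one" is approximated by the largest representable value. A minimal sketch of that arithmetic (values illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static int16_t sat_sub_q15(int16_t a, int16_t b)
    {
        const int32_t r = static_cast<int32_t>(a) - b;
        return static_cast<int16_t>(std::min(32767, std::max(-32768, r)));
    }

    int main()
    {
        const int16_t one    = 32767; // ~1.0 in Q0.15
        const int16_t forget = 26214; // ~0.8
        std::printf("input gate ~ %f\n", sat_sub_q15(one, forget) / 32768.f);
        return 0;
    }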
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -310,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
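The cell block below computes c_t = f * c_{t-1} + i * g with saturating arithmetic, and the quantized_cell_clip computed earlier bounds it via LU_BOUNDED_RELU; that clip is simply lstm_params.cell_clip() quantized onto the cell state's QSYMM16 grid. Worked numbers, assuming a hypothetical power-of-two cell scale:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float cell_scale = std::ldexp(1.f, -11); // hypothetical: cell state scale = 2^-11
        const float cell_clip  = 10.f;                 // illustrative lstm_params.cell_clip()

        const long    r = std::lround(cell_clip / cell_scale);
        const int16_t q = static_cast<int16_t>(std::min(32767L, std::max(-32768L, r)));

        // LU_BOUNDED_RELU with bounds (-q, q) then keeps c_t within +/-10 real units.
        std::printf("quantized_cell_clip = %d (~%f real units)\n", q, q * cell_scale);
        return 0;
    }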
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
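Both peephole paths (the forget gate above and the output gate below) rescale cell_state * cell_to_X_weights from the raw product scale to the gate's intermediate scale. The std::pow(2, cell_shift) factor in those expressions is the cell state's power-of-two scale, so the whole factor has the same w_scale * x_scale / intermediate form as the matmul rescales. A numeric sketch with hypothetical values:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int    cell_shift   = -11;        // hypothetical: cell state scale = 2^-11
        const double w_scale      = 0.004;      // cell_to_output_weights scale (illustrative)
        const double intermediate = 1.0 / 4096; // output_intermediate_scale (illustrative)

        // cf. cell_to_forget_scale above and cell_to_output_scale below
        const double s = std::ldexp(1.0, cell_shift) * w_scale / intermediate;
        std::printf("cell-to-gate rescale factor = %.8f\n", s);
        return 0;
    }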
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, 
&gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -417,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
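The hidden block below multiplies two Q0.15 tensors (the output gate and tanh(c_t)), so the raw int32 product carries scale 2^-15 * 2^-15 = 2^-30; dividing by the target hidden-state scale gives exactly the hidden_state_scale expression in the code. A quick equivalence check (hidden scale illustrative):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double hidden_scale = 0.007; // illustrative lstm_params.hidden_state_scale()

        // Product of two Q0.15 values has scale 2^-30 ...
        const double product_scale = std::ldexp(1.0, -15) * std::ldexp(1.0, -15);
        // ... so the requantization factor matches the code's pow(2,-15)/scale*pow(2,-15):
        const double rescale = std::ldexp(1.0, -15) / hidden_scale * std::ldexp(1.0, -15);

        std::printf("product scale %.3g, rescale %.3g (= %.3g / hidden scale)\n",
                    product_scale, rescale, product_scale);
        return 0;
    }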
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -439,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -450,59 +714,62 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
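The projection block below chains the same ideas once more: a GEMMLowp rescale of qprojection.scale * hidden_state_scale / qoutput_state_in.scale into QASYMM8_SIGNED output bounds, plus an optional clip whose int8 value is the real clip divided by the projection scale and clamped to [-128, 127]. A numeric sketch with purely illustrative quantization parameters:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const double qprojection_scale = 0.005;
        const double hidden_scale      = 0.007;
        const double qoutput_scale     = 0.012;
        const float  projection_clip   = 0.5f; // illustrative lstm_params.projection_clip()

        // cf. projection_scale below
        const double projection_scale = qprojection_scale * hidden_scale / qoutput_scale;

        // cf. quantized_projection_clip = clamp<int8_t>(clip / qprojection.scale, -128, 127)
        const long   r     = std::lround(projection_clip / qprojection_scale);
        const int8_t qclip = static_cast<int8_t>(std::min(127L, std::max(-128L, r)));

        std::printf("projection rescale %.8f, quantized clip %d\n", projection_scale, qclip);
        return 0;
    }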
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); - _projection_accumulate_res.allocator()->init(*output_state_out->info()); + _projection_accumulate_res.allocator()->init(*output_state_in->info()); _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape()); - _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res); + _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res); accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - 
_projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -513,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -535,14 +812,28 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias); @@ -560,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + 
lstm_params.cell_to_input_weights()); } } @@ -587,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -595,44 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + 
GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); + } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); - // Validate weights transpose - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + 
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -645,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float 
cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -675,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -698,85 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / 
lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if 
(has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, 
lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -784,91 +1198,109 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); - gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.output_data_type = hidden_out_info.data_type(); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t 
quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); } - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output)); return Status{}; } @@ -885,18 +1317,18 @@ void NEQLSTMLayer::run() _mm_recurrent_to_forget.run(); _recurrent_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_forget, Window::DimY); + _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY); + _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_forget, Window::DimY); + _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Forget), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } _forget_gate_sigmoid.run(); @@ -907,19 +1339,19 @@ void NEQLSTMLayer::run() _mm_recurrent_to_cell.run(); _recurrent_to_cell_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_modulation, Window::DimY); + _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Cell), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { - NEScheduler::get().schedule(&_input_gate_sub, Window::DimY); + _input_gate_sub.run(); } else { @@ -927,28 +1359,29 @@ void NEQLSTMLayer::run() _input_to_input_outstage.run(); _mm_recurrent_to_input.run(); _recurrent_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_input, Window::DimY); + _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY); + _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_input, Window::DimY); + _accumulate_cell_input.run(); 
} - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Input), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } _input_gate_sigmoid.run(); } // Cell. - NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY); - NEScheduler::get().schedule(&_add_forget_cell, Window::DimY); - if(_has_cell_clipping) + _pixelwise_mul_forget_cell.run(); + _pixelwise_mul_input_cell.run(); + _add_forget_cell.run(); + + if (_has_cell_clipping) { _cell_clip.run(); } @@ -958,65 +1391,73 @@ void NEQLSTMLayer::run() _input_to_output_outstage.run(); _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_output, Window::DimY); - if(_has_peephole) + _accumulate_input_recurrent_output.run(); + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY); + _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_to_output, Window::DimY); + _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Output), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } _output_gate_sigmoid.run(); // Hidden. _hidden_tanh.run(); - NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY); + _pixelwise_mul_hidden.run(); _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } - NEScheduler::get().schedule(&_accumulate_projection, Window::DimY); + _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } } // Copy output_state_out to output - NEScheduler::get().schedule(&_copy_output, Window::DimY); + _copy_output.run(); } void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { + if (_convert_input_to_forget_weights_to_qsymm8) + { + _input_to_forget_weights_f32.allocator()->allocate(); + _input_to_forget_weights_symm8.allocator()->allocate(); + _dequantize_input_to_forget_weights.run(); + _quantize_input_to_forget_weights.run(); + } + // Pre-transpose weights to be used in GEMM. 
_input_to_forget_weights_transposed.allocator()->allocate(); _input_to_cell_weights_transposed.allocator()->allocate(); @@ -1032,16 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_input_to_input_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_input_reduction, Window::DimY); + + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); + + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1056,19 +1506,47 @@ void NEQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_input_to_forget_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_forget_reduction, Window::DimY); - NEScheduler::get().schedule(&_input_to_cell_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_cell_reduction, Window::DimY); - NEScheduler::get().schedule(&_input_to_output_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_output_reduction, Window::DimY); - - if(_has_projection) + + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, 
&_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { - if(_projection_bias != nullptr) + _projection_eff_bias.allocator()->allocate(); + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { - _projection_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_projection_reduction, Window::DimY); + _projection_bias_add.run(); _projection_bias->mark_as_unused(); } @@ -1076,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); @@ -1094,5 +1572,4 @@ void NEQLSTMLayer::prepare() _is_prepared = true; } } - } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index 47cc8b05c1..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,43 @@ #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { -Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +struct NEQuantizationLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output)); + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuQuantize> op{nullptr}; +}; - return Status{}; +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEQuantizationLayer::~NEQuantizationLayer() = default; + +Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return cpu::CpuQuantize::validate(input, output); } void NEQuantizationLayer::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuQuantize>(); + _impl->op->configure(input->info(), output->info()); +} - // Configure quantize kernel - auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); +void NEQuantizationLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index 154b060c3d..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,22 +27,40 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { +NERNNLayer::~NERNNLayer() = default; + NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), - _add_output(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), + _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); @@ -56,23 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - 
ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -90,15 +119,15 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_f.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); - _activation_kernel.configure(&_add_output, hidden_state, info); + _activation.configure(&_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(hidden_state, output); + _copy_f.configure(hidden_state, output); } void NERNNLayer::run() @@ -111,16 +140,16 @@ void NERNNLayer::run() _gemm_state_f.run(); - NEScheduler::get().schedule(&_add_kernel, Window::DimY); - NEScheduler::get().schedule(&_activation_kernel, Window::DimY); + _add_f.run(); + _activation.run(); // copy hidden out to output - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); + _copy_f.run(); } void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index 2299bf78a5..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
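// NERNNLayer above swaps its raw kernels for function members (_add_f, _activation,
// _copy_f), so run() invokes them directly instead of pushing kernels through
// NEScheduler. The step computed is hidden_state = act(W * input + R * hidden_state + b),
// with the result also copied to output. Configuration sketch; dimensions and the
// activation choice are illustrative assumptions:
// @code
// Tensor input, weights, recurrent_weights, bias, hidden_state, output;
// ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
// NERNNLayer rnn; // F16/F32 only, per the new data-type check in validate()
// rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);
// rnn.run(); // fully connected -> GEMM on state -> add -> activation -> copy to output
// @endcode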
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" -#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<NEROIAlignLayerKernel>(); + auto k = std::make_unique<NEROIAlignLayerKernel>(); k->configure(input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index 3aca4b7b60..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,40 @@ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + namespace arm_compute { -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::~NEROIPoolingLayer() = default; + +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() +{ +} + +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { + return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { - _roi_kernel.configure(input, rois, output, pool_info); + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + + _roi_kernel = std::make_unique<NEROIPoolingLayerKernel>(); + _roi_kernel->configure(input, rois, output, pool_info); } void NEROIPoolingLayer::run() { - NEScheduler::get().schedule(&_roi_kernel, Window::DimX); + NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index 977d502286..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,16 +25,22 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NERangeKernel.h" + namespace arm_compute { -NERange::NERange() - : _kernel() +NERange::~NERange() = default; + +NERange::NERange() : _kernel() { } void NERange::configure(ITensor *output, const float start, const float end, const float step) { - _kernel.configure(output, start, end, step); + ARM_COMPUTE_LOG_PARAMS(output, start, end, step); + _kernel = std::make_unique<NERangeKernel>(); + _kernel->configure(output, start, end, step); } Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step) @@ -44,6 +50,6 @@ Status NERange::validate(const ITensorInfo *output, const float start, const flo void NERange::run() { - NEScheduler::get().schedule(&_kernel, Window::DimX); + NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index d53ed31645..a23db87059 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,29 +23,26 @@ */ #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -} // namespace - -NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() -{ -} - -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -53,29 +50,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
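// For example, with a rank-4 input the accepted axes are [-4, 3]; convert_negative_axis()
// below is assumed to wrap each negative entry by adding the rank, so -1 maps to 3.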
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -84,27 +88,45 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); const unsigned int remove_index = axis_local[i] - i; ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); - out_shape.remove_dimension(remove_index); + out_shape.remove_dimension(remove_index, false); } } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } return Status{}; } +} // namespace + +NEReduceMean::~NEReduceMean() = default; -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() +{ +} + +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); + // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -112,61 +134,72 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, _reduced_outs.resize(_reduction_ops - (keep_dims ? 
1 : 0)); _keep_dims = keep_dims; + ITensor *tmp_input = input; + ITensor *tmp_output = output; + Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = tmp_input->info()->num_dimensions(); convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } - // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output); } } void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 80ebe6731a..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
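// NEReduceMean chains one MEAN_SUM reduction per requested axis and, when keep_dims is
// false, finishes with a reshape that drops the reduced dimensions. Usage sketch; the
// shape and axes are illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(14U, 14U, 64U), 1, DataType::F32));
// NEReduceMean mean;
// mean.configure(&src, Coordinates(0, 1), /* keep_dims = */ false, &dst); // mean over W,H -> (64)
// src.allocator()->allocate();
// dst.allocator()->allocate(); // dst info was auto-initialised by configure()
// mean.run();
// @endcode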
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + namespace arm_compute { namespace @@ -39,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -53,14 +57,24 @@ size_t reduction_window_split_dimension(unsigned int axis) } } // namespace +NEReductionOperation::~NEReductionOperation() = default; + NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -69,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -79,113 +94,88 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _is_reshape_required = !keep_dims; auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); // Configure reduction kernel - _reduction_kernel.configure(input, output_internal, axis, op); + _reduction_kernel = std::make_unique<NEReductionOperationKernel>(); + _reduction_kernel->configure(input, output_internal, axis, op); _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(axis == 0) + if (_is_reshape_required) { - // Configure fill border kernel - const BorderSize fill_border_size = _reduction_kernel.border_size(); - PixelValue pixelValue; - switch(op) - { - case ReductionOperation::PROD: - { - pixelValue = PixelValue(1, input->info()->data_type(), input->info()->quantization_info()); - break; - } - case ReductionOperation::MIN: - { - pixelValue = std::get<1>(get_min_max(input->info()->data_type())); - break; - } - case ReductionOperation::MAX: - { - pixelValue = std::get<0>(get_min_max(input->info()->data_type())); - break; - } - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - { - pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); - break; - } - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: - case ReductionOperation::SUM: - { - pixelValue = PixelValue(static_cast<uint32_t>(0)); - break; - } - default: - ARM_COMPUTE_ERROR("Reduction Operation unsupported"); - } - _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue); - } - - if(_is_reshape_required) - { - _reshape_kernel.configure(output_internal, output); + _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); } } void NEReductionOperation::run() { - if(_reduction_axis == 0) - { - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); - } - NEScheduler::get().schedule(&_reduction_kernel, _window_split); - if(_is_reshape_required) + MemoryGroupResourceScope scope_mg(_memory_group); + NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); + if (_is_reshape_required) { - NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); + _reshape.run(); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp deleted file mode 100644 index 12c9f7b4e8..0000000000 --- a/src/runtime/NEON/functions/NERemap.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
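// Two behavioural changes in the NEReductionOperation rewrite above: the fill-border
// pre-pass for axis-0 reductions is removed along with its per-operation border values,
// and run() now opens a MemoryGroupResourceScope so the internal tensor used before the
// reshape is acquired and released per invocation. Sketch of an axis-0 sum; shapes are
// illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(128U, 24U), 1, DataType::F32));
// dst.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::F32));
// NEReductionOperation red;
// red.configure(&src, &dst, /* axis = */ 0U, ReductionOperation::SUM, /* keep_dims = */ false);
// src.allocator()->allocate();
// dst.allocator()->allocate();
// red.run(); // axis 0 splits work over Window::DimY, per reduction_window_split_dimension()
// @endcode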
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NERemap.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NERemapKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); - - auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>(); - - k->configure(input, map_x, map_y, output, policy); - - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 3cb0dc1762..89cf575f38 100644 --- a/src/runtime/NEON/functions/NEIm2Col.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,33 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEIm2Col.h" +#if defined(__aarch64__) + +#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEReorderKernel.h" + namespace arm_compute { -NEIm2Col::NEIm2Col() - : _kernel(), _y_dim(1) +NEReorderLayer::~NEReorderLayer() = default; + +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>()) { } -void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - - _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups); + auto k = std::make_unique<NEReorderKernel>(); + k->configure(input, output, input_wf, output_wf); + _reorder_kernel = std::move(k); } -Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +void NEReorderLayer::run() { - return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups); + // Run Reorder + NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -void NEIm2Col::run() +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - NEScheduler::get().schedule(&_kernel, _y_dim); + return NEReorderKernel::validate(input, output, input_wf, output_wf); } + } // namespace arm_compute + +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index dc8f5f1f66..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,16 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride) { - auto k = arm_compute::support::cpp14::make_unique<NEReorgLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, stride); + + auto k = std::make_unique<NEReorgLayerKernel>(); k->configure(input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 0a9f42d510..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
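// NEReorderLayer is a new, aarch64-only function: it owns a single NEReorderKernel and
// run() schedules it over Window::DimX to rewrite weight tensors from one WeightFormat
// blocking to another. Usage sketch; the concrete WeightFormat pair (and that this pair
// suits a given kernel) is an illustrative assumption:
// @code
// #if defined(__aarch64__)
// Tensor src_w, dst_w; // initialised and allocated elsewhere
// NEReorderLayer reorder;
// reorder.configure(&src_w, &dst_w, WeightFormat::OHWI, WeightFormat::OHWIo4);
// reorder.run();
// #endif // __aarch64__
// @endcode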
* * SPDX-License-Identifier: MIT * @@ -23,26 +23,51 @@ */ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" -#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuReshape.h" #include <utility> namespace arm_compute { +struct NEReshapeLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuReshape> op{nullptr}; +}; + +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; +NEReshapeLayer::~NEReshapeLayer() = default; + void NEReshapeLayer::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuReshape>(); + _impl->op->configure(input->info(), output->info()); } Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuReshape::validate(input, output)); return Status{}; } + +void NEReshapeLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index a950826270..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,20 +23,25 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "arm_compute/core/NEON/kernels/NEReverseKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { -void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis) +void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis) { - auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>(); - k->configure(input, output, axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis); + + auto k = std::make_unique<NEReverseKernel>(); + k->configure(input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return NEReverseKernel::validate(input, output, axis); + return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index acde0cfcc5..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2023 Arm Limited. 
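// NEReverse above gains a use_inverted_axis flag, forwarded unchanged to NEReverseKernel,
// which selects how the entries of the axis tensor are interpreted. Sketch; shapes, the
// axis data type and the flag value are illustrative assumptions:
// @code
// Tensor src, dst, axis; // axis: 1-D tensor listing the dimensions to reverse
// axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));
// NEReverse rev;
// rev.configure(&src, &dst, &axis, /* use_inverted_axis = */ false);
// @endcode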
* * SPDX-License-Identifier: MIT * @@ -23,220 +23,122 @@ */ #include "arm_compute/runtime/NEON/functions/NEScale.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" +#include "arm_compute/runtime/Tensor.h" -#include <cmath> -#include <cstddef> -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/operators/CpuScale.h" -using namespace arm_compute; - -namespace +namespace arm_compute { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy) +struct NEScale::Impl { - ARM_COMPUTE_ERROR_ON(nullptr == offsets); - ARM_COMPUTE_UNUSED(sampling_policy); - float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) - { - sampling_offset = 0.5f; - } - - Window win; - win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); - win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - - if(dx != nullptr && dy != nullptr) - { - // Pre-compute the offset and pixel's distance for BILINEAR interpolation - Iterator offsets_it(offsets, win); - Iterator dx_it(dx, win); - Iterator dy_it(dy, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size); - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); - } - else - { - // Pre-compute the offset for NEAREST interpolation - Iterator offsets_it(offsets, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const size_t in_xi = std::floor((id.x() + sampling_offset) * wr); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size; - }, - offsets_it); - } -} -} // namespace - -NEScale::NEScale() // NOLINT - : _offsets(), - _dx(), - _dy(), - _scale_kernel(), - _border_handler(), - _use_padding(true), - _align_corners(false) + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr<cpu::CpuScale> op{nullptr}; +}; + +NEScale::NEScale() : _impl(std::make_unique<Impl>()) { } +NEScale::~NEScale() = default; void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, output, info); - _use_padding = info.use_padding; - _align_corners = info.interpolation_policy == 
InterpolationPolicy::BILINEAR - && info.sampling_policy == SamplingPolicy::TOP_LEFT - && info.align_corners; + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuScale>(); + _impl->op->configure(input->info(), output->info(), info); + // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Get the tensor shape - const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height)); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const auto wr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); - const auto hr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners); - - // Get the element size of the input image - const size_t input_element_size = input->info()->element_size(); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; - - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - TensorInfo tensor_info_offsets(shape, Format::S32); - _offsets.allocator()->init(tensor_info_offsets); - - _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info); - - // Allocate once the configure methods have been called - _offsets.allocator()->allocate(); - - // Pre-compute offsets for nearest interpolation - precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy); - break; - } - case InterpolationPolicy::BILINEAR: - { - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dxdy(shape, Format::F32); + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; - _offsets.allocator()->init(tensor_info_offsets); - _dx.allocator()->init(tensor_info_dxdy); - _dy.allocator()->init(tensor_info_dxdy); + // Get the tensor shape + TensorShape shape(output->info()->dimension(idx_width)); + shape.set(1, output->info()->dimension(idx_height), false); - _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - // Allocate once the configure methods have been called - _offsets.allocator()->allocate(); - _dx.allocator()->allocate(); - _dy.allocator()->allocate(); + if (precompute_indices_weights) + { + const TensorInfo tensor_info_dxdy(shape, Format::F32); + const TensorInfo tensor_info_offsets(shape, Format::S32); - // Pre-compute dx, dy and offsets for bilinear interpolation - precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy); - break; - } - case InterpolationPolicy::AREA: + _impl->dx.allocator()->init(tensor_info_dxdy); + _impl->dy.allocator()->init(tensor_info_dxdy); + _impl->offsets.allocator()->init(tensor_info_offsets); + switch (policy_to_use) { - _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info); - break; + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + // Allocate once the configure methods have been called + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::BILINEAR: + { + // Allocate once the configure methods have been called + _impl->dx.allocator()->allocate(); + _impl->dy.allocator()->allocate(); + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::AREA: + { + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - if(info.use_padding) + else { - _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value); + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) + { + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } } } -void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, - bool align_corners) -{ - configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }); -} - Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - - ITensorInfo *offsets = nullptr; - ITensorInfo *dx = nullptr; - ITensorInfo *dy = nullptr; - - // Get data layout and width/height indices - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Get the tensor shape of auxilary buffers - const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height)); - - TensorInfo tensor_info_offsets(shape, 
Format::S32); - TensorInfo tensor_info_dx(shape, Format::F32); - TensorInfo tensor_info_dy(shape, Format::F32); - - switch(info.interpolation_policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - offsets = &tensor_info_offsets; - break; - case InterpolationPolicy::BILINEAR: - offsets = &tensor_info_offsets; - dx = &tensor_info_dx; - dy = &tensor_info_dy; - break; - default: - break; - } - - ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info)); - return Status{}; -} - -Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, - BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners })); - return Status{}; + return cpu::CpuScale::validate(input, output, info); } void NEScale::run() { - if(_use_padding) - { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - } - NEScheduler::get().schedule(&_scale_kernel, Window::DimY); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->dx); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->dy); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->offsets); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp deleted file mode 100644 index b7a99ff55a..0000000000 --- a/src/runtime/NEON/functions/NEScharr3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" - -#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 8587f7f28e..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,18 @@ */ #include "arm_compute/runtime/NEON/functions/NESelect.h" -#include "arm_compute/core/NEON/kernels/NESelectKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESelectKernel.h" namespace arm_compute { void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>(); + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); + + auto k = std::make_unique<NESelectKernel>(); k->configure(c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp deleted file mode 100644 index a4b0dfffaa..0000000000 --- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" - -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT - : _kernel() -{ -} - -void NESimpleAssemblyFunction::run() -{ - NEScheduler::get().schedule(_kernel.get(), Window::DimX); -} - -void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get()); - _kernel = std::move(kernel); - ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1); -} diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 5da8896d6f..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,40 +24,87 @@ #include "arm_compute/runtime/NEON/functions/NESlice.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { -void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends) +namespace experimental +{ +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); - auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>(); + auto k = std::make_unique<NEStridedSliceKernel>(); k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); } +} // namespace experimental + +struct NESlice::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NESlice> op{nullptr}; +}; + +NESlice::NESlice() : _impl(std::make_unique<Impl>()) +{ +} +NESlice::NESlice(NESlice &&) = default; +NESlice &NESlice::operator=(NESlice &&) = default; +NESlice::~NESlice() = default; + +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, 
+ const Coordinates &starts, + const Coordinates &ends) +{ + return experimental::NESlice::validate(input, output, starts, ends); +} + +void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::NESlice>(); + _impl->op->configure(input->info(), output->info(), starts, ends); +} + +void NESlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp deleted file mode 100644 index ca80ccd82e..0000000000 --- a/src/runtime/NEON/functions/NESobel3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" - -#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp deleted file mode 100644 index 2ddfee5028..0000000000 --- a/src/runtime/NEON/functions/NESobel5x5.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
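// NESlice above is now split in two: experimental::NESlice works on ITensorInfo and owns
// the NEStridedSliceKernel (strides fixed to 1, open ends encoded through
// construct_slice_end_mask(), negative starts rejected), while the public NESlice keeps
// src/dst in an Impl and replays them through an ITensorPack in run(). Usage sketch;
// shapes and coordinates are illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
// dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
// NESlice slice;
// slice.configure(&src, &dst, Coordinates(8, 8), Coordinates(24, 24)); // [8,24) on both dims
// src.allocator()->allocate();
// dst.allocator()->allocate();
// slice.run();
// @endcode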
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() -{ -} - -void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void NESobel5x5::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_sobel_hor, Window::DimY); - NEScheduler::get().schedule(&_sobel_vert, Window::DimY); -} 
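Note on the deletions above and below: NESobel5x5 (above) and NESobel7x7 (below) shared the same structure. A horizontal pass writes intermediate tensors (S16 for the 5x5 filter, S32 for 7x7) that a vertical pass then consumes, and the intermediates' backing memory is owned by a MemoryGroup that is acquired only while run() executes. A minimal sketch of that lifetime pattern follows, with a made-up TwoPassFunction standing in for the deleted classes and the actual kernel configuration elided:

// Sketch only: "TwoPassFunction" is a hypothetical stand-in for the deleted
// Sobel functions; the horizontal/vertical kernel calls are elided.
#include <memory>
#include <utility>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

namespace example
{
using namespace arm_compute;

class TwoPassFunction
{
public:
    explicit TwoPassFunction(std::shared_ptr<IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager))
    {
    }

    void configure(const TensorInfo &intermediate_info)
    {
        _tmp.allocator()->init(intermediate_info); // Describe the temporary
        _memory_group.manage(&_tmp);               // Hand its lifetime to the group
        // ... configure the two kernel passes against _tmp here ...
        _tmp.allocator()->allocate();              // Finalize; no memory bound yet
    }

    void run()
    {
        // RAII guard: the group's memory is acquired on entry and released on
        // exit, so the temporary only occupies memory while run() executes.
        MemoryGroupResourceScope scope_mg(_memory_group);
        // ... schedule the horizontal then the vertical pass here ...
    }

private:
    MemoryGroup _memory_group;
    Tensor      _tmp{};
};
} // namespace example

The same MemoryGroupResourceScope idiom survives the migration: the operator-based functions later in this diff (NESoftmaxLayerGeneric, NEWinogradConvolutionLayer) use it to guard their workspace tensors in run().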
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp deleted file mode 100644 index b47a37aedb..0000000000 --- a/src/runtime/NEON/functions/NESobel7x7.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() -{ -} - -void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, 
PixelValue(constant_border_value)); -} - -void NESobel7x7::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_sobel_hor, Window::DimY); - NEScheduler::get().schedule(&_sobel_vert, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 57d75af779..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,192 +23,74 @@ */ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "utils/TypePrinter.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" -#include <cfloat> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute { template <bool IS_LOG> -NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(), - _output_flattened(), _needs_flattening(false) +struct NESoftmaxLayerGeneric<IS_LOG>::Impl { -} + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; template <bool IS_LOG> -void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis) +NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) + : _impl(std::make_unique<Impl>()) { - // Flatten the input - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); - - // Initialize the flat input - _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel - // If flattening on the third axes, we use NEFlattenKernel. - // In all other cases we have to use NEReshapeKernel - if(axis != 3) - { - auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>(); - reshape_kernel_ptr->configure(input, &_input_flattened); - _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr); - } - else - { - auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>(); - flatten_kernel_ptr->configure(input, &_input_flattened); - _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr); - } - - // We need to init the output tensor here. 
Indeed, the reshape kernel expects - // both tensors to be already initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; +template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default; +template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default; + +template <bool IS_LOG> void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, int32_t axis) { - // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis)); - - // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). - axis = wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())); - - // We don't need flattening only in the case the input is 2D and axis is 1 - _needs_flattening = axis != 1; - - // If we are dealing with a 4D tensor, we will: - // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor - // - Execute all the pipeline (reduction + normalization) on the flattened tensor - // - Reshape the flattened output into the real output - if(_needs_flattening) - { - // Add to the memory manager _input_flattened - _memory_group.manage(&_input_flattened); - - // Configure _flatten_kernel and _input_flattened - configure_reshape_input_kernel(input, output, axis); - } - - // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) - // or it is the original input case (2D case) - ITensor *input_2D = (_needs_flattening ? &_input_flattened : input); - - // Create intermediate tensors shapes - const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? 
DataType::F32 : input_2D->info()->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - // Init intermediate tensors - TensorShape max_sum_shape = input_2D->info()->tensor_shape(); - max_sum_shape.set(0, 1); - _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); - _tmp.allocator()->init(tensor_info_tmp); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); - // Manage intermediate buffers - _memory_group.manage(&_max); - _memory_group.manage(&_tmp); - - // Configure Kernels - _max_kernel.configure(input_2D, &_max); - if(_needs_flattening) - { - // Add to the memory manager _output_flattened - _memory_group.manage(&_output_flattened); - - // The normalization kernel stores the result in a flat output tensor - _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp); - _input_flattened.allocator()->allocate(); - - // Reshape the flat output into the requested (4D) output - _reshape_kernel.configure(&_output_flattened, output); - - // Allocate the intermediate flat tensors - _output_flattened.allocator()->allocate(); - } - else - { - // Softmax 2D case - _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE); - _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp); - } - - // Allocate intermediate buffers - _max.allocator()->allocate(); - _tmp.allocator()->allocate(); + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis); - - // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). 
- axis = wrap_around(axis, static_cast<int32_t>(input->num_dimensions())); - - // Create intermediate tensor info - DataType tmp_data_type = input->data_type(); - const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true)); - const TensorInfo dont_care; - - const bool needs_flattening = (axis != 1); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - - if(axis != 3) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat)); - } - } - - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care)); - + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } template <bool IS_LOG> -void NESoftmaxLayerGeneric<IS_LOG>::run() +void NESoftmaxLayerGeneric<IS_LOG>::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_needs_flattening) - { - NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY); - } - - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); - NEScheduler::get().schedule(&_max_kernel, Window::DimY); - NEScheduler::get().schedule(&_softmax_kernel, Window::DimY); - - if(_needs_flattening) - { - NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); - } + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } template class NESoftmaxLayerGeneric<false>; template class NESoftmaxLayerGeneric<true>; -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index 205bc910a5..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,50 +28,76 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" + namespace arm_compute { -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; + +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill_f = std::make_unique<NEFill>(); + _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); + _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>(); + _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill_f = std::make_unique<NEFill>(); + _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>(); + _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) 
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -79,10 +105,10 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { - NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + _fill_f->run(); } - NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); + NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index 18d82918c7..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,17 +30,24 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" + namespace arm_compute { -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; + +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _space_to_depth_kernel.configure(input, output, block_shape); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + + _space_to_depth_kernel = std::make_unique<NESpaceToDepthLayerKernel>(); + _space_to_depth_kernel->configure(input, output, block_shape); } Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -51,6 +58,6 @@ Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo void NESpaceToDepthLayer::run() { - NEScheduler::get().schedule(&_space_to_depth_kernel, Window::DimY); + NEScheduler::get().schedule(_space_to_depth_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index 8131e47e3f..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 351497c96e..2f88ffca2a 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,27 +31,26 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStackLayerKernel.h" + namespace arm_compute { +NEStackLayer::~NEStackLayer() = default; + NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false) { } void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output) { - _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + ARM_COMPUTE_LOG_PARAMS(input, axis, output); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) - { - _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output); - } + _stack_kernel->configure(input, axis_u, output); } Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output) @@ -63,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const size_t rank = input[0]->num_dimensions(); const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1)); - const unsigned int num_inputs = input.size(); - - for(unsigned int i = 0; i < num_inputs; i++) - { - // All the tensors must have the same rank - ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); - // Validate Kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output)); - } + // Validate Kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output)); return Status{}; } void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + if (!_is_prepared) { - NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY); + _stack_kernel->prepare(); + _is_prepared = true; } + + NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension()); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index c9be563e17..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,92 @@ */ #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +namespace experimental +{ +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + + auto k = std::make_unique<NEStridedSliceKernel>(); k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } +} // namespace experimental + +struct NEStridedSlice::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NEStridedSlice> op{nullptr}; +}; + +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>()) +{ +} +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; +NEStridedSlice::~NEStridedSlice() = default; + +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::NEStridedSlice>(); + _impl->op->configure(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); +} + +void NEStridedSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp deleted file mode 100644 index 44cbbc8416..0000000000 --- a/src/runtime/NEON/functions/NETableLookup.cpp +++ 
/dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NETableLookup.h" - -#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NETableLookupKernel>(); - k->configure(input, lut, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp deleted file mode 100644 index f4fd85722c..0000000000 --- a/src/runtime/NEON/functions/NEThreshold.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEThreshold.h" - -#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) -{ - auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>(); - k->configure(input, output, threshold, false_value, true_value, type, upper); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 6bf8eaad20..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,16 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "arm_compute/core/NEON/kernels/NETileKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples) { - auto k = arm_compute::support::cpp14::make_unique<NETileKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); + + auto k = std::make_unique<NETileKernel>(); k->configure(input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 21d496300a..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,50 @@ */ #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuTranspose.h" namespace arm_compute { +struct NETranspose::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuTranspose> op{nullptr}; +}; + +NETranspose::NETranspose() : _impl(std::make_unique<Impl>()) +{ +} + +NETranspose::~NETranspose() = default; + void NETranspose::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuTranspose>(); + _impl->op->configure(input->info(), output->info()); } Status NETranspose::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NETransposeKernel::validate(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuTranspose::validate(input, output)); + return Status{}; } + +void NETranspose::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 21f35f8312..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,22 +58,23 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis) { std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); // Wrap around negative values const unsigned int axis_u = wrap_axis(axis, input->info()); @@ -79,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -100,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp deleted file mode 100644 index 469ca65bd9..0000000000 --- a/src/runtime/NEON/functions/NEWarpAffine.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEWarpAffine::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - switch(policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::BILINEAR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::AREA: - default: - ARM_COMPUTE_ERROR("Interpolation type not supported"); - } - - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); -} diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp deleted file mode 100644 index ac5edca125..0000000000 --- a/src/runtime/NEON/functions/NEWarpPerspective.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - switch(policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::BILINEAR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::AREA: - default: - ARM_COMPUTE_ERROR("Interpolation type not supported"); - } - - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); -} diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index d567a18709..7334be8456 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,749 +23,94 @@ */ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -namespace -{ -inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - if(input->data_type() == DataType::F32) - { - if(input_dims.width > 4 && input_dims.height > 4) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(input->data_type() == DataType::F16) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ -
ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} +using namespace arm_compute::experimental; -inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +struct NEWinogradConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} -inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const 
ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input) -{ - const DataLayout data_layout = input->info()->data_layout(); - const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = input->info()->dimension(3); - - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || 
conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - return INEWinogradLayerTransformWeightsKernel::validate(input, weights); -} - -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type) -{ - Size2D output_tile = Size2D{}; - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); - if(data_type == DataType::F16) - { - output_tile = Size2D(4U, 4U); - } - } - else if(kernel_dims == Size2D(5U, 5U)) - { - output_tile = Size2D(2U, 2U); - } - else if(kernel_dims == Size2D(1U, 3U)) - { - output_tile = Size2D(1U, 6U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = Size2D(6U, 1U); - } - else if(kernel_dims == Size2D(1U, 5U)) - { - output_tile = Size2D(1U, 4U); - } - else if(kernel_dims == Size2D(5U, 1U)) - { - output_tile = Size2D(4U, 1U); - } - else if(kernel_dims == Size2D(7U, 1U)) - { - output_tile = Size2D(2U, 1U); - } - else if(kernel_dims == Size2D(1U, 7U)) - { - output_tile = Size2D(1U, 2U); - } - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - const std::vector<WinogradConfiguration> fast_math_winograd_f16 = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)) - }; - - const std::vector<WinogradConfiguration> fast_math_winograd_f32 = - { - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - switch(data_type) - { - case DataType::F16: - return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end(); - case DataType::F32: - return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end(); - default: - return false; - } -} - -inline bool fuse_function_supported(const ActivationLayerInfo &act_info) -{ - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; -} - -arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info) -{ - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b()); - } - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b()); - } - default: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::None); - } - } -} -} //namespace + MemoryGroup memory_group{}; + std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool 
is_activationlayer_enabled{false}; + DataLayout data_layout{}; +}; NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager) - : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), - _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), - _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info)); - - // Get indices for the width and height - const DataLayout data_layout = input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx)); - const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx)); - const DataType data_type = input->info()->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - _weights = weights; - _input = input; - _output = output; - _is_prepared = false; - - int n_gemms = 0; - int N_BLOCK = 0; // Size of block used by GEMM. 
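The output tile chosen by winograd_output_tile() above pins down the rest of the configuration: for an output tile T and kernel K, the transforms operate on inner tiles of (T + K - 1) points per dimension, and the batched matrix multiply runs one GEMM per inner-tile point. A minimal sketch of that bookkeeping, assuming the F32 3x3 large-input case (values derived by hand here, not read from the library):

    // Sketch only: inner-tile and GEMM-count arithmetic implied by the
    // configuration above, for the F(4x4, 3x3) case selected on large inputs.
    constexpr int output_tile = 4;                            // 4x4 output tile
    constexpr int kernel_dim  = 3;                            // 3x3 kernel
    constexpr int inner_tile  = output_tile + kernel_dim - 1; // 6x6 transform domain
    constexpr int n_gemms     = inner_tile * inner_tile;      // 36 batched GEMMs
    static_assert(n_gemms == 36, "F(4x4, 3x3) runs 36 batched GEMMs");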
- - std::unique_ptr<INEWinogradLayerTransformInputKernel> transform_input_kernel; - std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel; - std::unique_ptr<INEWinogradLayerTransformOutputKernel> transform_output_kernel; - - if(data_type == DataType::F32) - { - if(kernel_size == Size2D(3, 3)) - { - if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - } - else if(kernel_size == Size2D(5, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 3)) - { - using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(3, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(5, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = 
support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 7)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(7, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(data_type == DataType::F16) - { - if(kernel_size == Size2D(3, 3)) - { - using config = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - - const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID; - const bool use_same_padding = use_padding_type == PADDING_SAME; - - // Get convolved dimensions - const int in_channels = input->info()->dimension(channel_idx); - const int out_channels = output->info()->dimension(channel_idx); - - const Tensor4DShape in_shape(internal_get_input_shape(input)); - const size_t data_type_size = input->info()->element_size(); - // Get the memory required to instantiate a new Winograd operator. 
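The sizing logic that follows reduces the convolution to a batched GEMM: every output tile becomes one GEMM row, so M counts tiles across the whole batch, while K and N are the input and output channel counts. A self-contained sketch of the same arithmetic under assumed dimensions (helper name and shape values are illustrative, and same padding is assumed so output spatial dims match the input):

    #include <cstddef>

    // Sketch of the M/K/N derivation used below; iceildiv_sketch mirrors iceildiv.
    static size_t iceildiv_sketch(size_t a, size_t b) { return (a + b - 1) / b; }

    static void winograd_gemm_dims_example()
    {
        const size_t n_batches = 1, out_rows = 224, out_cols = 224; // assumed shape
        const size_t tile_h = 4, tile_w = 4;                        // 4x4 output tile
        const size_t m = n_batches * iceildiv_sketch(out_rows, tile_h) *
                         iceildiv_sketch(out_cols, tile_w);         // 56 * 56 = 3136 tiles
        const size_t k = 64;                                        // input channels
        const size_t n = 128;                                       // output channels
        (void)m; (void)k; (void)n;
    }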
- constexpr size_t storage_alignment = 64; - - // Kernel Storage - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) - * data_type_size; - - // Input storage - const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) - * data_type_size; - - // Output storage - const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size; - const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels); - const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels); - const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - - // Configure GEMM - const int tile_rows = iceildiv(output_shape.first, output_tile.height); - const int tile_cols = iceildiv(output_shape.second, output_tile.width); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK); - const int output_matrix_row_stride = kernel_matrix_row_stride; - - TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(data_type_size); - a_strides.set(1, a_strides[0] * k); - //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. - a_strides.set(2, 0); - a_strides.set(3, data_type_size * input_matrix_stride); - - TensorShape b_shape(n, k, n_gemms); - Strides b_strides(data_type_size); - b_strides.set(1, data_type_size * kernel_matrix_row_stride); - b_strides.set(2, data_type_size * kernel_matrix_stride); - - TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(data_type_size); - d_strides.set(1, data_type_size * output_matrix_row_stride); - //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
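Reading the shapes above: per GEMM, A holds M rows of K input-channel values, B holds K rows of N output-channel values, and D holds M rows of N values, with the n_gemms slices stacked along the outermost dimension (the commented-out per-batch strides are forced to 0, as the FIXME notes). A hedged sketch of the B-tensor stride construction, with illustrative names mirroring the quantities computed in configure():

    #include <cstddef>

    // Sketch only: byte strides for the transformed-weights tensor, matching the
    // b_strides setup above.
    static void b_strides_sketch(size_t element_size,
                                 size_t kernel_matrix_row_stride, // roundup(out_channels, N_BLOCK)
                                 size_t kernel_matrix_stride,     // per-GEMM slice stride
                                 size_t strides[3])
    {
        strides[0] = element_size;                            // along N
        strides[1] = element_size * kernel_matrix_row_stride; // along K
        strides[2] = element_size * kernel_matrix_stride;     // along the GEMM batch
    }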
- d_strides.set(2, 0); - d_strides.set(3, data_type_size * output_matrix_stride); - - TensorInfo a_info{}; - TensorInfo b_info{}; - TensorInfo d_info{}; - a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); - - _input_transformed.allocator()->init(a_info, storage_alignment); - _kernel_storage.allocator()->init(b_info, storage_alignment); - _output_transformed.allocator()->init(d_info, storage_alignment); - - // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0), - _output->info()->dimension(1), _output->info()->dimension(3)), - 1, _output->info()->data_type()); - _output_nhwc.allocator()->init(info); - - const ITensor *input_to_use = _input; - ITensor *output_to_use = _output; - PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); - const unsigned int max_num_threads = NEScheduler::get().num_threads(); - - // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_input_nhwc); - _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - input_to_use = &_input_nhwc; - weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); - } - - // Configure input transform kernel - _memory_group.manage(&_input_transformed); - _memory_group.manage(&_input_workspace); - transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); - TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type()); - _input_workspace.allocator()->init(input_workspace_info); - _input_workspace.allocator()->allocate(); - if(data_layout == DataLayout::NCHW) - { - _input_nhwc.allocator()->allocate(); - } - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector); - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - // Configure GEMM function - _memory_group.manage(&_output_transformed); - _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f); - _input_transformed.allocator()->allocate(); - - // Configure output transform function - // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method - if(data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_output_nhwc); - output_to_use = &_output_nhwc; - } - const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info); - - transform_output_kernel->configure(biases, - &_output_transformed, - output_matrix_stride, - output_to_use, - in_shape.n_batches, - output_shape.first, - output_shape.second, - out_channels, - &_output_workspace, - activation); - - const size_t output_workspace_size = 
transform_output_kernel->get_working_space_size(max_num_threads); - TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type()); - _output_workspace.allocator()->init(output_workspace_info); - _output_workspace.allocator()->allocate(); - _output_transformed.allocator()->allocate(); +NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default; - // Reorder the convoluted output to ACL's ordering NCHW - if(data_layout == DataLayout::NCHW) - { - _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U)); - _output_nhwc.allocator()->allocate(); - } - - _transform_input_kernel = std::move(transform_input_kernel); - _transform_weights_kernel = std::move(transform_weights_kernel); - _transform_output_kernel = std::move(transform_output_kernel); +void NEWinogradConvolutionLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + _impl->original_weights = weights; + _impl->op = std::make_unique<cpu::CpuWinogradConv2d>(); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info); - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(_output, nullptr, act_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() { - const DataLayout data_layout = _input->info()->data_layout(); - prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(data_layout == DataLayout::NCHW) - { - //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - _permute_input.run(); - } - - // Transform input tensor to the winograd domain - NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX); - - //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - _gemm_function.run(); - - // Transform output tensor to the spatial domain - NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); - - if(data_layout == DataLayout::NCHW) - { - // Reorder the convoluted output to ACL's ordering NCHW - _permute_output.run(); - } - - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info)); - - // 
Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height)); - const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height)); - const DataType data_type = input->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->data_layout()); - - // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape); - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - - if(kernel_size == Size2D(3, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - if(kernel_size == Size2D(3, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(7, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 7)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported"); - } + return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - // Permute weights - _weights_hwio.allocator()->allocate(); - _permute_weights.run(); - _weights->mark_as_unused(); + _impl->op->prepare(_impl->prep_pack); + _impl->original_weights->mark_as_unused(); - // Transform weights - _kernel_storage.allocator()->allocate(); - NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); - _weights_hwio.allocator()->free(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index e0094f4eec..0000000000 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include <set> - -namespace arm_compute -{ -namespace -{ -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return 
arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>( - 
n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input, - const ITensor *weights, - ITensor *output, - PadStrideInfo conv_info, - ActivationLayerInfo act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_UNUSED(dilation); - const DataType data_type = input->info()->data_type(); - const TensorShape shape = input->info()->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = dilation.x(); - const int padding_top = conv_info.pad_top(); - const int padding_left = conv_info.pad_left(); - const int padding_bottom = conv_info.pad_bottom(); - const int padding_right = conv_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = conv_info.stride().first; - const unsigned int kernel_size = weights->info()->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - 
quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->info()->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector<float> fmultipliers; - std::vector<int32_t> qmultipliers; - std::vector<int32_t> qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), - _pImpl(support::cpp14::make_unique<LocalImpl>()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default; - -void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input, - const ITensor *weights, - const ITensor *bias, - ITensor *output, - const 
PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(depth_multiplier); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); - - // Output auto initialization if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation); - auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); - - _input = input; - _weights = weights; - _bias = bias; - _output = output; - _is_prepared = false; - - // Create convolver - _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation); - ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment); - _memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment); -} - -Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape 
output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - PadStrideInfo conv_info, - unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weights size - std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = conv_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = conv_info.pad_top(); - const auto pad_right = conv_info.pad_right(); - const auto pad_bottom = conv_info.pad_bottom(); - const auto pad_left = conv_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), conv_info, DataLayout::NCHW, dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || 
((dilation.x() == dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; -} - -void NEDepthwiseConvolutionAssemblyDispatch::run() -{ - // Prepare assembly kernel - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr); - _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer())); - - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - const int input_element_size = _input->info()->element_size(); - const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - const int output_element_size = _output->info()->element_size(); - const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX); -} - -void NEDepthwiseConvolutionAssemblyDispatch::prepare() -{ - if(!_is_prepared) - { - _packed_weights.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = _weights->info()->element_size(); - const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size; - const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(), - _weights->buffer() + _weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (_bias != nullptr) ? _bias->buffer() : nullptr); - _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer()); - - _weights->mark_as_unused(); - if(_bias != nullptr) - { - _bias->mark_as_unused(); - } - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index f67f06fc94..2a5abb5f7a 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,16 +27,29 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" #include <omp.h> namespace arm_compute { +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) OMPScheduler::OMPScheduler() // NOLINT - : _num_threads(omp_get_max_threads()) + : _num_threads(cpu_info().get_cpu_num_excluding_little()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) { } +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ +OMPScheduler::OMPScheduler() // NOLINT + : _num_threads(omp_get_max_threads()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) +{ +} +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ unsigned int OMPScheduler::num_threads() const { @@ -46,60 +59,78 @@ unsigned int OMPScheduler::num_threads() const void OMPScheduler::set_num_threads(unsigned int num_threads) { const unsigned int num_cores = omp_get_max_threads(); - _num_threads = (num_threads == 0) ? num_cores : num_threads; +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) + const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads; + _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads; +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + _num_threads = (num_threads == 0) ? 
num_cores : num_threads; +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ } void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { + ITensorPack tensors; + schedule_common(kernel, hints, kernel->window(), tensors); +} + +void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC, "Dynamic scheduling is not supported in OMPScheduler"); - const Window &max_window = kernel->window(); + const Window &max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); + info.cpu_info = &cpu_info(); + kernel->run_op(tensors, max_window, info); } else { const unsigned int num_windows = num_threads; std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); - kernel->run(win, info); + kernel->run_op(tensors, win, info); }; } run_workloads(workloads); } } - #ifndef DOXYGEN_SKIP_THIS void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads) { - const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size())); - if(num_threads < 1) + const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size()); + const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); + + if (num_threads_to_use < 1) { return; } ThreadInfo info; - info.cpu_info = &_cpu_info; - info.num_threads = num_threads; - #pragma omp parallel firstprivate(info) num_threads(num_threads) + info.cpu_info = &cpu_info(); + info.num_threads = num_threads_to_use; +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { - const int tid = omp_get_thread_num(); + const int tid = omp_get_thread_num(); + info.thread_id = tid; - workloads[tid](info); + workloads[wid](info); } } #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp index 3133202bf3..d746f618b5 100644 --- a/src/runtime/OffsetLifetimeManager.cpp +++ b/src/runtime/OffsetLifetimeManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
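Editor's note: the reworked run_workloads above stops hand-assigning one workload per OpenMP thread and instead drives a parallel for over all workloads, fixing the granularity with schedule(static, 1) and keeping threads near each other with proc_bind(close). A minimal sketch of the same dispatch shape, where Workload is a simplified stand-in for IScheduler::Workload:

#include <omp.h>

#include <algorithm>
#include <functional>
#include <vector>

using Workload = std::function<void(int thread_id)>;

void run_workloads(std::vector<Workload> &workloads, unsigned int num_threads)
{
    const auto amount_of_work     = static_cast<unsigned int>(workloads.size());
    const auto num_threads_to_use = std::min(num_threads, amount_of_work);
    if (num_threads_to_use < 1)
    {
        return;
    }
    // schedule(static, 1) hands out iterations round-robin, so workload i runs
    // on thread i % num_threads_to_use, matching the one-workload-per-thread
    // granularity of the old loop while also covering amount_of_work > threads.
#pragma omp parallel for num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1)
    for (unsigned int wid = 0; wid < amount_of_work; ++wid)
    {
        workloads[wid](omp_get_thread_num());
    }
}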
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,6 @@ #include "arm_compute/runtime/IAllocator.h" #include "arm_compute/runtime/IMemoryGroup.h" #include "arm_compute/runtime/OffsetMemoryPool.h" -#include "support/MemorySupport.h" #include <algorithm> #include <cmath> @@ -44,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment) return (remainder != 0U) ? offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -57,7 +55,7 @@ const OffsetLifetimeManager::info_type &OffsetLifetimeManager::info() const std::unique_ptr<IMemoryPool> OffsetLifetimeManager::create_pool(IAllocator *allocator) { ARM_COMPUTE_ERROR_ON(allocator == nullptr); - return support::cpp14::make_unique<OffsetMemoryPool>(allocator, _blob); + return std::make_unique<OffsetMemoryPool>(allocator, _blob); } MappingType OffsetLifetimeManager::mapping_type() const @@ -72,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index c8381a11d2..8f3c1a84ba 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
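Editor's note: the align_offset helper shown in context above rounds an offset up to the next multiple of an alignment. A few worked cases; the zero-alignment guard is added here only to keep the sketch self-contained:

#include <cassert>
#include <cstddef>

std::size_t align_offset(std::size_t offset, std::size_t alignment)
{
    const std::size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}

int main()
{
    assert(align_offset(13, 8) == 16); // rounds up to the next multiple of 8
    assert(align_offset(16, 8) == 16); // already aligned: unchanged
    assert(align_offset(5, 0) == 5);   // zero alignment: no-op (guard above)
    return 0;
}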
*/ -#include <algorithm> - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -30,7 +28,8 @@ #include "arm_compute/runtime/IMemoryPool.h" #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" -#include "support/MemorySupport.h" + +#include <algorithm> namespace arm_compute { @@ -51,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -60,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -75,6 +74,6 @@ MappingType OffsetMemoryPool::mapping_type() const std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate() { ARM_COMPUTE_ERROR_ON(!_allocator); - return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info); + return std::make_unique<OffsetMemoryPool>(_allocator, _blob_info); } } // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp b/src/runtime/OperatorTensor.cpp index 4f5ee28a76..19415b35cf 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,25 +21,40 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/OperatorTensor.h" -#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h" -#include "arm_compute/core/Helpers.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/MemoryRegion.h" + +#include "support/Cast.h" namespace arm_compute { -GCActivationLayer::GCActivationLayer(GCRuntimeContext *ctx) - : IGCSimpleFunction(ctx) +namespace experimental +{ +OperatorTensor::OperatorTensor(ITensorInfo *info, IMemory *memory) + : _info(info), _memory(memory), _mem_type(MemoryType::CPU) { } -void GCActivationLayer::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info) +ITensorInfo *OperatorTensor::info() const { - auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr; + return _info; +} - auto k = arm_compute::support::cpp14::make_unique<GCActivationLayerKernel>(core_ctx); - k->configure(input, output, act_info); - _kernel = std::move(k); +ITensorInfo *OperatorTensor::info() +{ + return _info; +} + +uint8_t *OperatorTensor::buffer() const +{ + switch (_mem_type) + { + case MemoryType::CPU: + return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer(); + default: + ARM_COMPUTE_ERROR("Memory type not supported."); + } } +} // namespace experimental } // namespace arm_compute diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 455f969bd3..7fb9bd8000 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
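Editor's note: OffsetMemoryPool::acquire above hands every handle a sub-region of one shared blob: each mapping stores a byte offset, and the handle receives the range [offset, blob_size). The shape of that bookkeeping, with toy stand-ins for IMemoryRegion and MemoryMappings:

#include <cstddef>
#include <cstdint>
#include <map>

struct Region
{
    std::uint8_t *base = nullptr;
    std::size_t   size = 0;
};

// Each handle's mapped value is its byte offset into the pool's single blob.
void acquire(const Region &blob, std::map<Region *, std::size_t> &mappings)
{
    for (auto &handle : mappings)
    {
        handle.first->base = blob.base + handle.second;
        handle.first->size = blob.size - handle.second; // mirrors extract_subregion(offset, size - offset)
    }
}

void release(std::map<Region *, std::size_t> &mappings)
{
    for (auto &handle : mappings)
    {
        *handle.first = Region{}; // drop the view; the blob itself stays alive
    }
}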
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IMemoryPool.h" -#include "support/MemorySupport.h" #include <algorithm> #include <list> using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -53,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -71,7 +67,7 @@ void PoolManager::register_pool(std::unique_ptr<IMemoryPool> pool) _free_pools.push_front(std::move(pool)); // Update semaphore - _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size()); + _sem = std::make_unique<arm_compute::Semaphore>(_free_pools.size()); } std::unique_ptr<IMemoryPool> PoolManager::release_pool() @@ -79,14 +75,14 @@ std::unique_ptr<IMemoryPool> PoolManager::release_pool() arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); _free_pools.pop_front(); // Update semaphore - _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size()); + _sem = std::make_unique<arm_compute::Semaphore>(_free_pools.size()); return pool; } diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp deleted file mode 100644 index 16a91a8704..0000000000 --- a/src/runtime/Pyramid.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
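Editor's note: PoolManager above keeps two lists and moves pools between them with std::list::splice, which relinks nodes without copying the unique_ptrs, while a semaphore throttles callers when every pool is occupied. The list-juggling half of that design in isolation (semaphore and error checks omitted):

#include <algorithm>
#include <list>
#include <memory>
#include <mutex>

struct IMemoryPool
{
};

class PoolLists
{
public:
    IMemoryPool *lock_pool()
    {
        std::lock_guard<std::mutex> lock(_mtx);
        if (_free.empty())
        {
            return nullptr; // the real manager blocks on a semaphore here
        }
        _occupied.splice(std::begin(_occupied), _free, std::begin(_free));
        return _occupied.front().get();
    }

    void unlock_pool(IMemoryPool *pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        auto it = std::find_if(std::begin(_occupied), std::end(_occupied),
                               [pool](const std::unique_ptr<IMemoryPool> &p) { return p.get() == pool; });
        if (it != std::end(_occupied))
        {
            _free.splice(std::begin(_free), _occupied, it); // relink, no reallocation
        }
    }

private:
    std::list<std::unique_ptr<IMemoryPool>> _free, _occupied;
    std::mutex                              _mtx;
};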
- */ -#include "arm_compute/runtime/Pyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PyramidInfo.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" - -#include <cmath> - -using namespace arm_compute; - -void Pyramid::init(const PyramidInfo &info) -{ - internal_init(info, false); -} - -void Pyramid::init_auto_padding(const PyramidInfo &info) -{ - internal_init(info, true); -} - -void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding) -{ - _info = info; - _pyramid.resize(_info.num_levels()); - - size_t w = _info.width(); - size_t h = _info.height(); - size_t ref_w = w; - size_t ref_h = h; - bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale()); - TensorShape tensor_shape = _info.tensor_shape(); - - // Note: Look-up table used by the OpenVX sample implementation - const std::array<float, 4> c_orbscale = { 0.5f, - SCALE_PYRAMID_ORB, - SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB, - SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB - }; - - for(size_t i = 0; i < _info.num_levels(); ++i) - { - TensorInfo tensor_info(tensor_shape, _info.format()); - - if(auto_padding) - { - tensor_info.auto_padding(); - } - - _pyramid[i].allocator()->init(tensor_info); - - if(is_orb_scale) - { - float orb_scale = c_orbscale[(i + 1) % 4]; - w = static_cast<int>(std::ceil(static_cast<float>(ref_w) * orb_scale)); - h = static_cast<int>(std::ceil(static_cast<float>(ref_h) * orb_scale)); - - if(0 == ((i + 1) % 4)) - { - ref_w = w; - ref_h = h; - } - } - else - { - w = (w + 1) * _info.scale(); - h = (h + 1) * _info.scale(); - } - - // Update tensor_shape - tensor_shape.set(0, w); - tensor_shape.set(1, h); - } -} - -void Pyramid::allocate() -{ - for(size_t i = 0; i < _info.num_levels(); ++i) - { - _pyramid[i].allocator()->allocate(); - } -} - -const PyramidInfo *Pyramid::info() const -{ - return &_info; -} - -Tensor *Pyramid::get_pyramid_level(size_t index) const -{ - ARM_COMPUTE_ERROR_ON(index >= _info.num_levels()); - - return &_pyramid[index]; -} diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index 308e2788a9..1de8d2abdb 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()), _device_props() +RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } @@ -48,9 +47,4 @@ IAssetManager *RuntimeContext::asset_manager() { return nullptr; } - -const DeviceProperties &RuntimeContext::properties() -{ - return _device_props; -} } // namespace arm_compute diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 380ad90a27..3f1e96968a 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #include "arm_compute/runtime/Scheduler.h" #include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" #if ARM_COMPUTE_CPP_SCHEDULER #include "arm_compute/runtime/CPP/CPPScheduler.h" @@ -55,19 +54,19 @@ namespace std::map<Scheduler::Type, std::unique_ptr<IScheduler>> init() { std::map<Scheduler::Type, std::unique_ptr<IScheduler>> m; - m[Scheduler::Type::ST] = support::cpp14::make_unique<SingleThreadScheduler>(); + m[Scheduler::Type::ST] = std::make_unique<SingleThreadScheduler>(); #if defined(ARM_COMPUTE_CPP_SCHEDULER) - m[Scheduler::Type::CPP] = support::cpp14::make_unique<CPPScheduler>(); + m[Scheduler::Type::CPP] = std::make_unique<CPPScheduler>(); #endif // defined(ARM_COMPUTE_CPP_SCHEDULER) #if defined(ARM_COMPUTE_OPENMP_SCHEDULER) - m[Scheduler::Type::OMP] = support::cpp14::make_unique<OMPScheduler>(); + m[Scheduler::Type::OMP] = std::make_unique<OMPScheduler>(); #endif // defined(ARM_COMPUTE_OPENMP_SCHEDULER) return m; } } // namespace -std::map<Scheduler::Type, std::unique_ptr<IScheduler>> Scheduler::_schedulers = init(); +std::map<Scheduler::Type, std::unique_ptr<IScheduler>> Scheduler::_schedulers{}; void Scheduler::set(Type t) { @@ -77,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -94,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) " + "before Scheduler::get()"); } else { @@ -107,8 +107,13 @@ IScheduler &Scheduler::get() } else { + if (_schedulers.empty()) + { + _schedulers = init(); + } + auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index c6c90348b4..4fb08d79f5 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. 
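Editor's note: Scheduler::_schedulers above changes from being populated by init() at static-initialisation time to an empty map that get() fills on first use, removing any dependence on translation-unit initialisation order. The pattern in miniature:

#include <map>
#include <memory>

struct IScheduler
{
    virtual ~IScheduler() = default;
};
struct SingleThreadScheduler : IScheduler
{
};

enum class Type
{
    ST
};

// Default-constructed: nothing runs before the first caller needs it.
static std::map<Type, std::unique_ptr<IScheduler>> schedulers{};

IScheduler &get(Type t)
{
    if (schedulers.empty())
    {
        schedulers[Type::ST] = std::make_unique<SingleThreadScheduler>(); // deferred init()
    }
    return *schedulers.at(t);
}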
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,6 @@ */ #include "arm_compute/runtime/SchedulerFactory.h" -#include "support/MemorySupport.h" - #include "arm_compute/core/Error.h" #if ARM_COMPUTE_CPP_SCHEDULER #include "arm_compute/runtime/CPP/CPPScheduler.h" @@ -50,16 +48,16 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr<IScheduler> SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { - return support::cpp14::make_unique<SingleThreadScheduler>(); + return std::make_unique<SingleThreadScheduler>(); } case Type::CPP: { #if ARM_COMPUTE_CPP_SCHEDULER - return support::cpp14::make_unique<CPPScheduler>(); + return std::make_unique<CPPScheduler>(); #else /* ARM_COMPUTE_CPP_SCHEDULER */ ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler."); #endif /* ARM_COMPUTE_CPP_SCHEDULER */ @@ -67,7 +65,7 @@ std::unique_ptr<IScheduler> SchedulerFactory::create(Type type) case Type::OMP: { #if ARM_COMPUTE_OPENMP_SCHEDULER - return support::cpp14::make_unique<OMPScheduler>(); + return std::make_unique<OMPScheduler>(); #else /* ARM_COMPUTE_OPENMP_SCHEDULER */ ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler."); #endif /* ARM_COMPUTE_OPENMP_SCHEDULER */ diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp new file mode 100644 index 0000000000..74ee539fec --- /dev/null +++ b/src/runtime/SchedulerUtils.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/SchedulerUtils.h" + +#include "arm_compute/core/Error.h" + +#include <cmath> + +namespace arm_compute +{ +namespace scheduler_utils +{ +#ifndef BARE_METAL +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) +{ + /* + * We want the same ratio of threads in M & N to the ratio of m and n problem size + * + * Therefore: mt/nt == m/n where mt*nt == max_threads + * + * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt + * nt^2 = max_threads * (m/n) + * nt = sqrt( max_threads * (m/n) ) + */ + //ratio of m to n in problem dimensions + double ratio = m / static_cast<double>(n); + + // nt = sqrt(max_threads * (m / n) ) + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); + + //find the nearest factor of max_threads + for (unsigned i = 0; i != adjusted; ++i) + { + //try down + const unsigned adj_down = adjusted - i; + if (max_threads % adj_down == 0) + { + return {adj_down, max_threads / adj_down}; + } + + //try up + const unsigned adj_up = adjusted + i; + if (max_threads % adj_up == 0) + { + return {adj_up, max_threads / adj_up}; + } + } + + //we didn't find anything so lets bail out with maxes biased to the largest dimension + if (m > n) + { + return {std::min<unsigned>(m, max_threads), 1}; + } + else + { + return {1, std::min<unsigned>(n, max_threads)}; + } +} +#endif /* #ifndef BARE_METAL */ +} // namespace scheduler_utils +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECol2Im.cpp b/src/runtime/SchedulerUtils.h index 262ba8f2af..46644a369e 100644 --- a/src/runtime/NEON/functions/NECol2Im.cpp +++ b/src/runtime/SchedulerUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,22 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/NEON/functions/NECol2Im.h" +#ifndef SRC_COMPUTE_SCHEDULER_UTILS_H +#define SRC_COMPUTE_SCHEDULER_UTILS_H -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" -#include "support/MemorySupport.h" +#include <cstddef> +#include <utility> namespace arm_compute { -void NECol2Im::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims) +namespace scheduler_utils { - auto k = arm_compute::support::cpp14::make_unique<NECol2ImKernel>(); - k->configure(input, output, convolved_dims); - _kernel = std::move(k); -} - -Status NECol2Im::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims) -{ - return NECol2ImKernel::validate(input, output, convolved_dims); -} +/** Given two dimensions and a maximum number of threads to utilise, calculate the best + * combination of threads that fit in (multiplied together) max_threads. + * + * This algorithm assumes that work in either of the dimensions is equally difficult + * to compute + * + * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension + */ +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n); +} // namespace scheduler_utils } // namespace arm_compute +#endif /* SRC_COMPUTE_SCHEDULER_UTILS_H */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index b010a32eca..f87256abb1 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2018 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 8f7ecd6ffa..f17e323694 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index ffd5cc7236..372852bfea 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,6 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/MemoryRegion.h" -#include "support/MemorySupport.h" #include <cstddef> @@ -44,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -66,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -89,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -118,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -134,9 +134,9 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { - _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(info().total_size(), alignment_to_use)); + _memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use)); } else { @@ -157,7 +157,7 @@ Status TensorAllocator::import_memory(void *memory) ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr); ARM_COMPUTE_RETURN_ERROR_ON(alignment() != 0 && !arm_compute::utility::check_aligned(memory, alignment())); - _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size())); + _memory.set_owned_region(std::make_unique<MemoryRegion>(memory, info().total_size())); info().set_is_resizable(false); return Status{}; diff --git a/src/runtime/TracePoint.cpp b/src/runtime/TracePoint.cpp deleted file mode 100644 index 817d63bdf3..0000000000 --- a/src/runtime/TracePoint.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/TracePoint.h" -#include <stdio.h> -#include <vector> - -#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/common/LSTMParams.h" -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -TRACE_TO_STRING(KeyPointArray) -TRACE_TO_STRING(Pyramid) -TRACE_TO_STRING(LSTMParams<ITensor>) -TRACE_TO_STRING(FullyConnectedLayerInfo) -TRACE_TO_STRING(arm_gemm::Requantize32) - -CONST_PTR_CLASS(KeyPointArray) -CONST_PTR_CLASS(Pyramid) -CONST_PTR_CLASS(LSTMParams<ITensor>) -CONST_PTR_CLASS(DetectionPostProcessLayerInfo) -CONST_PTR_CLASS(FullyConnectedLayerInfo) -CONST_PTR_CLASS(GenerateProposalsInfo) -CONST_PTR_CLASS(arm_gemm::Requantize32) -} // namespace arm_compute diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 2204ec11d7..a7f7b5f3cb 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
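Editor's note: TensorAllocator::allocate above defaults to 64-byte alignment, a common cache-line size, whenever the tensor did not request one. A standalone restatement of that rule using C++17 std::aligned_alloc (the real code routes through MemoryRegion instead):

#include <cstddef>
#include <cstdlib>

void *allocate_region(std::size_t size, std::size_t requested_alignment)
{
    const std::size_t alignment_to_use = (requested_alignment != 0) ? requested_alignment : 64;
    // std::aligned_alloc requires the size to be a multiple of the alignment.
    const std::size_t padded_size = ((size + alignment_to_use - 1) / alignment_to_use) * alignment_to_use;
    return std::aligned_alloc(alignment_to_use, padded_size); // release with std::free
}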
*/ -#include "arm_compute/runtime/Utils.h" +#include "src/runtime/Utils.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -31,6 +31,8 @@ namespace arm_compute { +namespace utils +{ #ifndef DOXYGEN_SKIP_THIS static const std::string information = #include "arm_compute_version.embed" @@ -39,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map<Scheduler::Type, const std::string> scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -66,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } @@ -78,4 +77,5 @@ unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, un const unsigned int num_of_stages = num_of_wg / 128 + 2; return num_of_stages; } +} // namespace utils } // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/Utils.h index e05eb4c4ae..f8775c9612 100644 --- a/src/runtime/GLES_COMPUTE/GCTensor.cpp +++ b/src/runtime/Utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,63 +21,40 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#ifndef SRC_RUNTIME_UTILS_H +#define SRC_RUNTIME_UTILS_H -#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/IRuntimeContext.h" +#include "arm_compute/runtime/Scheduler.h" -namespace arm_compute -{ -GCTensor::GCTensor(IRuntimeContext *) - : _allocator(this) -{ -} - -ITensorAllocator *GCTensor::allocator() -{ - return &_allocator; -} - -TensorInfo *GCTensor::info() const -{ - return &_allocator.info(); -} - -TensorInfo *GCTensor::info() -{ - return &_allocator.info(); -} - -uint8_t *GCTensor::buffer() const -{ - return _allocator.data(); -} - -GLuint GCTensor::gc_buffer() const -{ - return _allocator.get_gl_ssbo_name(); -} +#include <string> -void GCTensor::associate_memory_group(arm_compute::IMemoryGroup *memory_group) -{ - _allocator.set_associated_memory_group(memory_group); -} - -void GCTensor::map(bool blocking) +namespace arm_compute { - IGCTensor::map(blocking); -} - -void GCTensor::unmap() +namespace utils { - IGCTensor::unmap(); -} +/** Convert a Scheduler::Type into a string. + * + * @param[in] t @ref Scheduler::Type to be translated to string. + * + * @return The string describing the scheduler type. + */ +const std::string &string_from_scheduler_type(Scheduler::Type t); -uint8_t *GCTensor::do_map(bool blocking) -{ - return _allocator.map(blocking); -} +/** Schedules a kernel using the context if not nullptr else uses the legacy scheduling flow. 
+ * + * @param[in] ctx Context to use. + * @param[in] kernel Kernel to schedule. + * @param[in] hints Hints to use. + */ +void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints); -void GCTensor::do_unmap() -{ - _allocator.unmap(); -} +/** Calculate number of stages for parallel implementations + * + * @param[in] input_x_dimension input tensor x dimension + * @param[in] axis axis to be used + */ +unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis); +} // namespace utils } // namespace arm_compute +#endif /* SRC_RUNTIME_UTILS_H */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..aba32871d0 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
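Editor's note: schedule_kernel_on_ctx, now documented in the utils namespace above, prefers a context-owned scheduler and falls back to the global singleton only when no context is given. The rule restated with simplified stand-in types:

struct ICPPKernel
{
};
struct IScheduler
{
    virtual void schedule(ICPPKernel *kernel) = 0;
    virtual ~IScheduler()                     = default;
};
struct IRuntimeContext
{
    virtual IScheduler *scheduler() = 0;
    virtual ~IRuntimeContext()      = default;
};

IScheduler &global_scheduler(); // stand-in for Scheduler::get()

void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel)
{
    if (ctx != nullptr)
    {
        ctx->scheduler()->schedule(kernel); // context-owned scheduler wins
    }
    else
    {
        global_scheduler().schedule(kernel); // legacy global flow
    }
}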
+ */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_default.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo 
ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h new file mode 100644 index 0000000000..ed6a4c3c68 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
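Editor's note: configure() earlier in this file dispatches through a table of pointer-to-member functions keyed by data type, so each GPU family only supplies its per-type entries. The idiom in a self-contained form:

#include <array>
#include <cstddef>
#include <cstdio>

class Config
{
public:
    int configure(std::size_t dt_index)
    {
        using Fn = int (Config::*)() const;
        static constexpr std::array<Fn, 2> table{&Config::for_f32, &Config::for_f16};
        const Fn func = table.at(dt_index);
        return (this->*func)(); // invoke through the member pointer
    }

private:
    int for_f32() const
    {
        return 32;
    }
    int for_f16() const
    {
        return 16;
    }
};

int main()
{
    Config cfg;
    std::printf("%d\n", cfg.configure(1)); // prints 16
    return 0;
}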
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Bifrost based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..4b7666d5aa --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G57: + func = configs_G57.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + if (ofm == 4) + { + desc.m0 = 1; + desc.n0 = 4; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + } + else + { + desc.m0 = 1; + desc.n0 = 2; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + 
+ const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + // k0 should be as larger as possible. However, we should avoid + // having left-over for loops that make the implementation slower. + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + + if (is_pointwise) + { + if (ofm == 4) + { + desc.m0 = 1; + desc.n0 = 4; + } + else + { + desc.m0 = 1; + desc.n0 = 1; + } + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + } + } + else + { + if (m < 64) + { + desc.m0 = 1; + desc.n0 = 1; + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + } + else + { + if (ofm >= 16) + { + if (m / 6 > 24000) + { + desc.m0 = 6; + } + else + { + desc.m0 = 5; + } + desc.n0 = 8; + desc.k0 = 4; + } + else + { + desc.m0 = 2; + desc.n0 = 8; + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + } + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * 
dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 2; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + if (ofm > 16) + { + desc.m0 = 4; + desc.n0 = 8; + desc.k0 = 8; + } + else + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h new file mode 100644 index 0000000000..efd879a567 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
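Editor's note: the Valhall f16 heuristics above pick k0 as large as possible, taking the largest of {16, 8, 4} that divides the weights' channel count k so the inner loop has no left-over iterations. The selection rule in isolation:

#include <cassert>
#include <cstdint>

std::int32_t choose_k0(std::int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // may leave a remainder, but keeps a usable vector width
}

int main()
{
    assert(choose_k0(64) == 16);
    assert(choose_k0(24) == 8); // 24 % 16 != 0, 24 % 8 == 0
    assert(choose_k0(10) == 4);
    return 0;
}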
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Valhall based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h new file mode 100644 index 0000000000..215b17ef79 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** ClDirectConvolution factory class */ +class ClDirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDirectConvKernelConfig + */ + static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h new file mode 100644 index 0000000000..e5b270c720 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
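[Editor's note] From the caller's side, the factory above reduces target-specific tuning to two calls: create() to obtain the right IClDirectConvKernelConfig subclass (note that Midgard deliberately reuses the Bifrost G71 tuning), and configure() to obtain the descriptor. A minimal usage sketch, assuming src_info, wei_info and conv_info are valid objects prepared elsewhere; query_direct_conv is a hypothetical helper name, not part of this patch.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"

using namespace arm_compute;

// Hypothetical helper: ask the heuristic for a direct-conv kernel descriptor.
DirectConvComputeKernelInfo query_direct_conv(const ITensorInfo   *src_info,
                                              const ITensorInfo   *wei_info,
                                              const PadStrideInfo &conv_info,
                                              GPUTarget            gpu)
{
    auto config = cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu);
    return config->configure(src_info, wei_info, conv_info); // m0/n0/k0 + cl_image flag
}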
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Basic container for the OpenCL direct convolution configuration functions */ +template <class T> +class ClDirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for direct convolution F32 + * @param[in] func_f16 Function to call for direct convolution F16 + * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the direct convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the Direct convolution kernel configuration */ +class IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClDirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..98ebf3ebbe --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace + +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_G7x.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, 
dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..41d86c9c14 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
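[Editor's note] A detail worth noting in the F32/F16 helpers above: n0 is first chosen from the depth-multiplier divisibility ladder and then passed through adjust_vec_size() so the vector width fits the actual channel count, and the preceding assert encodes the invariant that this clamp must never change n0 once the weights have been committed to cl_image storage. The sketch below restates the ladder with a simplified clamp; pick_n0 is hypothetical, and the real adjust_vec_size() (AdjustVecSize.h) handles more cases than shown.

#include <cstddef>

// Hypothetical restatement of the n0 selection above (assumes kernel_c >= 1).
std::size_t pick_n0(unsigned int depth_multiplier, std::size_t kernel_c)
{
    std::size_t n0 = 1;
    if (depth_multiplier == 1 || (depth_multiplier % 4) == 0)
    {
        n0 = 4; // prefer 4-wide vector accesses
    }
    else if ((depth_multiplier % 2) == 0)
    {
        n0 = 2;
    }

    // Simplified stand-in for adjust_vec_size(): never exceed the channel count.
    while (n0 > kernel_c)
    {
        n0 /= 2;
    }
    return n0;
}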
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..ef1bb3858c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G77: + func = configs_G77.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo 
*wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..fabce77b54 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
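[Editor's note] For quantized types, the heuristic above is much simpler than the floating-point paths: cl_image export is disabled outright, n0 is 4 only when depth_multiplier == 1, and m0 is raised to 2 only when the kernel walks the input with unit stride and unit dilation. A compact restatement with hypothetical names:

// Compact restatement of the quantized (u8) depthwise heuristic above.
struct DwcDesc
{
    unsigned int n0;
    unsigned int m0;
    bool         export_weights_to_cl_image;
};

DwcDesc pick_u8_desc(unsigned int depth_multiplier, unsigned int stride_x, unsigned int dilation_x)
{
    DwcDesc d{};
    d.export_weights_to_cl_image = false;   // cl_image is reserved for FP weights here
    d.n0 = (depth_multiplier == 1) ? 4 : 1; // vectorise channels only without a multiplier

    const bool unit_step = (stride_x == 1) && (dilation_x == 1);
    d.m0 = (unit_step && depth_multiplier == 1) ? 2 : 1; // row reuse needs unit stride/dilation
    return d;
}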
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index cae3123d74..c8b006c546 100644 --- a/src/runtime/CL/tuners/MidgardTuner.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,57 +21,41 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/tuners/MidgardTuner.h" - #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" namespace arm_compute { -namespace tuners -{ -namespace +namespace cl_dwc { -void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k) +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - switch(gpu_target) + // Check whether we can use the cl image with the weights. 
+ if (!export_to_cl_image(weights)) { - case GPUTarget::MIDGARD: - case GPUTarget::T600: - case GPUTarget::T700: - case GPUTarget::T800: - if(k._output->info()->dimension(1) == 196) - { - lws_hint = cl::NDRange(1, 7); - } - else - { - lws_hint = cl::NDRange(8, 8); - } - break; - default: - lws_hint = cl::NullRange; + return false; } - k.set_lws_hint(lws_hint); -} -} // namespace + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + const size_t kernel_h = weights->tensor_shape()[idx_h]; -void MidgardTuner::tune_kernel_static(ICLKernel &kernel) -{ - if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr) + // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons: + // 1- When the kernel size is 1x1 + // 2- When the depth multiplier is greater than 1 and not multiple of 4. + if ((kernel_w == 1) && (kernel_h == 1)) { - tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel)); + return false; } -} -void MidgardTuner::tune_kernel_dynamic(ICLKernel &kernel) -{ - ARM_COMPUTE_UNUSED(kernel); + if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0) + { + return false; + } + + return true; } -} // namespace tuners +} // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h index cef6246f51..e3484c04ff 100644 --- a/src/runtime/NEON/functions/NEYOLOLayer.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,22 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
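[Editor's note] use_cl_image_for_weights() above is a pure predicate with three rejections: the device or shape cannot export to cl_image at all; the kernel is pointwise (1x1, too little data reuse to repay the image-setup cost); or the depth multiplier is greater than 1 but not a multiple of 4, which would force n0 below cl_image-friendly widths. The mirror below abstracts the export_to_cl_image() capability check into a boolean parameter purely for illustration.

#include <cstddef>

// Illustrative mirror of the decision above; device_can_export stands in for
// the export_to_cl_image(weights) capability check.
bool weights_want_cl_image(bool device_can_export,
                           std::size_t kernel_w, std::size_t kernel_h,
                           unsigned int depth_multiplier)
{
    if (!device_can_export)
    {
        return false;
    }
    if (kernel_w == 1 && kernel_h == 1) // pointwise: buffer storage wins
    {
        return false;
    }
    if (depth_multiplier > 1 && (depth_multiplier % 4) != 0)
    {
        return false; // would shrink n0 below cl_image-friendly widths
    }
    return true;
}
// e.g. weights_want_cl_image(true, 3, 3, 2) == false; (true, 3, 3, 4) == true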
*/ -#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h" - -#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h" -#include "support/MemorySupport.h" +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS namespace arm_compute { -void NEYOLOLayer::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - auto k = arm_compute::support::cpp14::make_unique<NEYOLOLayerKernel>(); - k->configure(input, output, act_info, num_classes); - _kernel = std::move(k); -} +// Forward declaration +class ITensorInfo; -Status NEYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes) +namespace cl_dwc { - return NEYOLOLayerKernel::validate(input, output, act_info, num_classes); -} +/** Utility function to know whether we can use the cl image storage for the weights of depthwise convolution to get better performance + * + * @param[in] weights Weights TensorInfo of the depthwise convolution + * @param[in] depth_multiplier Depth multiplier + * + * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance + */ +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier); + +} // namespace cl_dwc } // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..031cf1859a --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_dwc +{ +/** ClDWCNativeKernelConfigurationFactory factory class */ +class ClDWCNativeKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDWCNative kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDWCNativeKernelConfig + */ + static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + // The heuristic for Midgard is the same as the one used for Arm Mali-G71 + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..614a6622df --- /dev/null +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
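[Editor's note] Usage mirrors the direct-conv factory, with the depthwise-specific dilation and depth_multiplier parameters added to configure(). A hedged sketch, assuming the tensor and convolution objects are prepared elsewhere; query_dwc is a hypothetical name.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

using namespace arm_compute;

// Hypothetical helper: query the depthwise-native descriptor for a device.
DWCComputeKernelInfo query_dwc(const ITensorInfo   *src_info,
                               const ITensorInfo   *wei_info,
                               const PadStrideInfo &conv_info,
                               const Size2D        &dilation,
                               unsigned int         depth_multiplier,
                               GPUTarget            gpu)
{
    auto cfg = cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu);
    return cfg->configure(src_info, wei_info, conv_info, dilation, depth_multiplier);
}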
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Basic container for the OpenCL depthwise convolution configuration functions */ +template <class T> +class ClDWCNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for depthwise convolution F32 + * @param[in] func_f16 Function to call for depthwise convolution F16 + * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the depthwise convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the depthwise convolution kernel configuration */ +class IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClDWCNativeKernelConfig() = default; + /** This method returns the @ref DWCComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + * @param[in] dilation Kernel dilation + * @param[in] depth_multiplier Output feature maps multiplier + */ + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3380d8f1b7 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 Arm Limited. 
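[Editor's note] One behaviour of ClDWCNativeConfigArray::get_function() above that is easy to miss: all three 8-bit quantized formats fold onto the single DT_INT8 slot, so one heuristic serves QASYMM8, QASYMM8_SIGNED and QSYMM8_PER_CHANNEL, while an unrecognized type yields nullptr for the caller's ARM_COMPUTE_ERROR_ON_MSG to catch. The standalone sketch below reproduces just that slotting with plain function pointers and hypothetical names.

#include <array>

// Stand-alone sketch of the 3-slot dispatch: one entry per family of types.
enum class DT { F32, F16, QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL };

using HeuristicFn = int (*)();

struct ConfigArray3
{
    std::array<HeuristicFn, 3> slots; // [0]=F32, [1]=F16, [2]=all Int8 variants

    HeuristicFn get(DT dt) const
    {
        switch (dt)
        {
            case DT::F32:
                return slots[0];
            case DT::F16:
                return slots[1];
            case DT::QASYMM8:
            case DT::QASYMM8_SIGNED:
            case DT::QSYMM8_PER_CHANNEL:
                return slots[2]; // quantized types share one heuristic
            default:
                return nullptr;  // caller raises the "not supported" error
        }
    }
};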
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + + // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes + // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned + // for the pointwise convolution cases. 
+ + ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type()); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + desc.m0 = 1; + desc.n0 = 2; + desc.k0 = 16; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F32: 5x4x4 + if (m < 16000) + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + // k0 should be as larger as possible. However, we should avoid + // having left-over for loops that make the implementation slower. + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + + desc.m0 = 1; + desc.n0 = ofm; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F16: 8x4 + if (m >= 16000 && k < 4) + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 8; + } + } + } + + return desc; +} +} // namespace cl_indirect_conv +} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index 1c30af1b71..bab808c66c 100644 --- a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,47 +21,35 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
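[Editor's note] Two magic numbers in the G77 heuristics above merit a gloss: k0 is the largest of {16, 8, 4} that divides k, so the accumulation loop needs no remainder pass, and m >= 16000 marks the empirically determined point (per the in-code comments) where the largest block sizes pay off. The k0 ladder in isolation, as a hypothetical helper:

// Hypothetical restatement of the k0 choice in configure_G77_f16 above.
int pick_k0(int k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // per the comment above, the kernel clamps k0 to k when k < 4
}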
*/ -#include "arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h" +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" namespace arm_compute { -GCRuntimeContext::GCRuntimeContext() - : _gpu_owned_scheduler(support::cpp14::make_unique<GCScheduler>()), - _gpu_scheduler(_gpu_owned_scheduler.get()), - _core_context() +namespace cl_indirect_conv { - auto attrs = create_opengl_display_and_context(); - auto display = std::get<0>(attrs); - auto ctx = std::get<1>(attrs); - - _gpu_owned_scheduler->default_init_with_context(display, ctx); - _kernel_lib.init("./cs_shaders/", display, ctx); - - _core_context = GCCoreRuntimeContext(&_kernel_lib); -} - -GCKernelLibrary &GCRuntimeContext::kernel_library() -{ - return _kernel_lib; -} - -GCCoreRuntimeContext *GCRuntimeContext::core_runtime_context() -{ - return &_core_context; -} - -void GCRuntimeContext::set_gpu_scheduler(GCScheduler *scheduler) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler); - _gpu_scheduler = scheduler; -} - -GCScheduler *GCRuntimeContext::gpu_scheduler() +/** Valhall based OpenCL indirect convolution configuration */ +class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig { - return _gpu_scheduler; -} +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClIndirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_indirect_conv } // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..5e7ba6f8e9 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** ClIndirectConvolution factory class */ +class ClIndirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClIndirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClIndirectConvKernelConfig + */ + static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..d05da18b58 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
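[Editor's note] Unlike the direct-conv and depthwise factories, this factory has no per-architecture branches: every supported target receives ClIndirectConvDefaultConfigValhall. Because the direct and indirect heuristics both return a DirectConvComputeKernelInfo, a caller can in principle query them side by side; the sketch below assumes such wiring, uses a hypothetical name (compare_descriptors), and is not code from this patch.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"

using namespace arm_compute;

// Hypothetical side-by-side query of the two heuristics for one convolution.
void compare_descriptors(const ITensorInfo *src, const ITensorInfo *wei,
                         const PadStrideInfo &conv, GPUTarget gpu)
{
    auto direct_cfg   = cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu);
    auto indirect_cfg = cl_indirect_conv::ClIndirectConvKernelConfigurationFactory::create(gpu);

    const DirectConvComputeKernelInfo d = direct_cfg->configure(src, wei, conv);
    const DirectConvComputeKernelInfo i = indirect_cfg->configure(src, wei, conv);
    (void)d; // a scheduler could choose between the two kernels here, e.g.
    (void)i; // preferring direct convolution for 1x1 (pointwise) kernels
}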
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** Basic container for the OpenCL indirect convolution configuration functions */ +template <class T> +class ClIndirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + + /** Constructor + * + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 + * + */ + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} + { + } + + /** Method to return the indirect convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the indirect convolution kernel configuration */ +class IClIndirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClIndirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3a02a60650 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
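[Editor's note] The configure() method that opens the matmul implementation below derives the GEMM dimensions from the tensor shapes and the adjoint flags: for a non-transposed LHS, m comes from the y() axis and k from the fastest x() axis, adj_lhs swaps them, n comes from the RHS analogously, and batch dimensions beyond the second are first collapsed into z. A sketch of just that derivation, with the shape type simplified to a hypothetical struct in place of arm_compute::TensorShape:

// Hypothetical simplification of the m/n/k/b extraction in configure() below.
struct Shape
{
    unsigned int x, y, z; // x is the fastest-moving axis; z holds collapsed batches
};

struct Dims
{
    unsigned int m, n, k, b;
};

Dims matmul_dims(Shape lhs, Shape rhs, bool adj_lhs, bool adj_rhs)
{
    Dims d{};
    d.m = adj_lhs ? lhs.x : lhs.y; // transposing the LHS swaps the roles of x and y
    d.n = adj_rhs ? rhs.y : rhs.x;
    d.k = adj_lhs ? lhs.y : lhs.x;
    d.b = lhs.z;                   // batches already collapsed into z
    return d;
}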
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) +{ +} + +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +{ + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultConfigValhall::configure_G715_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G715_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G715_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + case GPUTarget::G710: + default: + func = configs_G710.get_function(lhs->data_type()); + break; + } + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + TensorShape lhs_shape = lhs->tensor_shape(); + TensorShape rhs_shape = rhs->tensor_shape(); + + const bool is_batched = lhs_shape.num_dimensions() > 2; + + if (is_batched == true) + { + lhs_shape.collapse_from(2); + } + + const unsigned int m = adj_lhs ? lhs_shape.x() : lhs_shape.y(); + const unsigned int n = adj_rhs ? rhs_shape.y() : rhs_shape.x(); + const unsigned int k = adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + const unsigned int b = lhs_shape.z(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + return configure_G715_f32(m, n, k, b, rhs_lock_padding, info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 
2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 
0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(rhs_lock_padding); + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; + + const bool adj_lhs = info.adj_lhs(); + 
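+    // Each row of the tables above encodes a reference workload and its tuned
+    // configuration as {M, N, K, B, M0, N0, K0, IMG_RHS} (the layout checked in
+    // find_info()). In this u8 path every IMG_RHS entry is 0, so no cl_image2d
+    // fallback table is needed and find_info() is called directly below.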
const bool adj_rhs = info.adj_rhs(); + + if ((adj_lhs == false) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else + { + return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); + } +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..5279871057 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; + +private: + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul variant selection */
+class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClMatMulNativeDefaultVariantValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    MatMulKernelType select_kernel(const ITensorInfo *lhs,
+                                   const ITensorInfo *rhs,
+                                   const MatMulInfo &info,
+                                   const ActivationLayerInfo &act_info) override;
+
+private:
+    MatMulKernelType configure_G715_float(int k, bool act_enabled);
+    MatMulKernelType configure_G715_quantized(int k, bool act_enabled);
+    MatMulKernelType configure_default_float(int k, bool act_enabled);
+    MatMulKernelType configure_default_quantized(int k, bool act_enabled);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
new file mode 100644
index 0000000000..89cad30214
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <cmath>
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+                             const MatMulKernelInfo &info1,
+                             unsigned int m,
+                             unsigned int n,
+                             unsigned int k,
+                             unsigned int b,
+                             DataType data_type,
+                             bool rhs_lock_padding)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+                             "The fallback MatMul configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
+
+    const bool adj_lhs = info0.adj_lhs;
+    const bool adj_rhs = info0.adj_rhs;
+
+    TensorInfo lhs_info =
+        !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+    TensorInfo rhs_info =
+        !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+    TensorInfo dst_info;
+
+    if (rhs_lock_padding == false)
+    {
+        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+        {
+            return info0;
+        }
+        else
+        {
+            return info1;
+        }
+    }
+    else
+    {
+        return info1;
+    }
+}
+
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool adj_lhs,
+                           bool adj_rhs,
+                           unsigned int m,
+                           unsigned int n,
+                           unsigned int k,
+                           unsigned int b)
+{
+    size_t min_acc = std::numeric_limits<size_t>::max();
+    size_t min_idx = 0;
+
+    ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+    const size_t num_rows = configs.size();
+    const size_t num_cols = configs[0].size();
+
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+                             "The entry should have 8 integer values representing: M, N, K, B, M0, N0,
K0, IMG_RHS"); + ARM_COMPUTE_UNUSED(num_cols); + + // Find nearest GeMM workload + // Note: the workload does not depend on the K dimension + for (size_t y = 0; y < num_rows; ++y) + { + size_t mc0 = static_cast<size_t>(configs[y][0]); + size_t nc0 = static_cast<size_t>(configs[y][1]); + size_t kc0 = static_cast<size_t>(configs[y][2]); + size_t bc0 = static_cast<size_t>(configs[y][3]); + + size_t acc = 0; + acc += (m - mc0) * (m - mc0); + acc += (n - nc0) * (n - nc0); + acc += (k - kc0) * (k - kc0); + acc += (b - bc0) * (b - bc0); + acc = std::sqrt(acc); + if (acc < min_acc) + { + min_acc = acc; + min_idx = y; + } + } + + // Get the configuration from the nearest GeMM shape + MatMulKernelInfo desc; + desc.adj_lhs = adj_lhs; + desc.adj_rhs = adj_rhs; + desc.m0 = configs[min_idx][4]; + desc.n0 = configs[min_idx][5]; + desc.k0 = configs[min_idx][6]; + desc.export_rhs_to_cl_image = configs[min_idx][7]; + + return desc; +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h new file mode 100644 index 0000000000..699f5fe8c1 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +// Forward declaration +struct MatMulKernelInfo; + +namespace cl_matmul +{ +using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>; + +/** This function accepts two MatMulKernelInfo objects where only the first can be with cl_image2d support enabled. + * The aim of this function is to check whether the first MatMulKernelInfo object is valid. If not, the function will + * return the second MatMulKernelInfo object. Otherwise, the first one. 
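+ * In other words, info0 is the preferred configuration and info1 the safe fallback:
+ * info0 is only returned when the RHS paddings are not locked and
+ * ClMatMulNativeKernel::validate() accepts it.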
+ * + * @param[in] info0 MatMulKernelInfo with cl_image2d support + * @param[in] info1 MatMulKernelInfo to fall-back if cl_image2d cannot be used + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * @param[in] data_type Data type + * @param[in] rhs_lock_padding Flag used to know whether the RHS paddings are locked + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo select_info(const MatMulKernelInfo &info0, + const MatMulKernelInfo &info1, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); + +/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user + * + * @param[in] configs List of best configurations for a limited number of MatMul shapes + * @param[in] adj_lhs Adjoint LHS flag value + * @param[in] adj_rhs Adjoint RHS flag value + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h new file mode 100644 index 0000000000..e7485bca81 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** ClMatMul configuration factory class */
+class ClMatMulNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to create the ClMatMul configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelConfig
+     */
+    static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..c2895b8919
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** ClMatMul variant factory class */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+    /** Static method to create the ClMatMul kernel variant class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelVariant
+     */
+    static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Basic container for the OpenCL MatMul Native configuration functions */ +template <class T> +class ClMatMulNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for matmul native F32 + * @param[in] func_f16 Function to call for matmul native F16 + * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the matmul native configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the matmul native kernel configuration + * This is the base class that chooses architecture specific kernel configurations. +*/ +class IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClMatMulNativeKernelConfig() = default; + /** This method returns the @ref MatMulKernelInfo for the given inputs + * + * @param[in] lhs LHS tensor + * @param[in] rhs RHS tensor + * @param[in] info MatMul info + */ + virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..eac41dd6a3 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H + +#include "arm_compute/core/CoreTypes.h" // DataType +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +#include <array> + +namespace arm_compute +{ +namespace cl_matmul +{ +enum class MatMulKernelType +{ + /** Native matrix multiplication for FP types */ + NATIVE_FP, + + /** Native matrix multiplication for quantized types */ + NATIVE_QUANTIZED, + + /** Native matrix multiplication using MMUL extension for FP types */ + NATIVE_MMUL_FP, + + /** Native matrix multiplication using MMUL extension for Quantized types */ + NATIVE_MMUL_QUANTIZED +}; + +/** Basic container for the OpenCL MatMul Native variant functions */ +template <class T> +class ClMatMulNativeVariantArray +{ +public: + /** Alias for Float index */ + static constexpr size_t DT_FLOAT = 0; + /** Alias for Quantized type index */ + static constexpr size_t DT_QUANTIZED = 1; + + /** Constructor + * + * @param[in] func_float Function to call for matmul native float (F32, F16) + * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized} + { + } + + /** Method to return the matmul native variant function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + case DataType::F16: + return _configs.at(DT_FLOAT); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_QUANTIZED); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the matmul native kernel variant + * This is the base class that chooses architecture specific kernel variants. 
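+ * Concrete implementations (for example ClMatMulNativeDefaultVariantValhall) map the
+ * input data type, the K dimension and the presence of a fused activation to one of
+ * the MatMulKernelType values above.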
+*/
+class IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelVariant() = default;
+    /** This method returns the @ref MatMulKernelType for the given inputs
+     *
+     * @param[in] lhs      LHS tensor
+     * @param[in] rhs      RHS tensor
+     * @param[in] info     MatMul info
+     * @param[in] act_info Activation layer info
+     */
+    virtual MatMulKernelType select_kernel(const ITensorInfo *lhs,
+                                           const ITensorInfo *rhs,
+                                           const MatMulInfo &info,
+                                           const ActivationLayerInfo &act_info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
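For orientation, a minimal usage sketch of how the two matmul heuristic factories introduced above might be driven by a caller. This is not part of the patch: the tensor shapes, the G710 target, and the default-constructed MatMulInfo and ActivationLayerInfo are illustrative assumptions.

// Usage sketch (assumed shapes/target; not part of the patch above).
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

using namespace arm_compute;

void pick_matmul_heuristics()
{
    const GPUTarget gpu = GPUTarget::G710;

    // LHS is M x K, RHS is K x N with no adjoint and batch size 1, matching the
    // shape convention used by select_info() above.
    const TensorInfo lhs(TensorShape(64U, 128U, 1U), 1, DataType::F32); // K, M, B
    const TensorInfo rhs(TensorShape(32U, 64U, 1U), 1, DataType::F32);  // N, K, B

    const MatMulInfo          mm_info{};  // adj_lhs and adj_rhs default to false
    const ActivationLayerInfo act_info{}; // no fused activation

    // First choose the kernel variant (plain native vs. MMUL-based) ...
    auto variant = cl_matmul::ClMatMulNativeKernelVariantFactory::create(gpu);
    const cl_matmul::MatMulKernelType type = variant->select_kernel(&lhs, &rhs, mm_info, act_info);

    // ... then the blocking configuration (m0/n0/k0, cl_image2d export).
    auto config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu);
    const MatMulKernelInfo desc = config->configure(&lhs, &rhs, mm_info);

    ARM_COMPUTE_UNUSED(type, desc);
}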