From b4bb6a03f717a320b935809fde795b3d6ec5a69f Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 24 May 2021 16:01:32 +0100
Subject: Rename ported functions

Rename CpuPooling to CpuPool2d
Rename CpuPoolingKernel to CpuPool2dKernel
Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel
Move CpuPool2dAssemblyWrapperKernel into the internal subfolder
Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel
Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch
Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d
Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel
Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel
Rename CpuDirectConvolution to CpuDirectConv2d
Rename ClPoolingKernel to ClPool2dKernel
Rename ClPooling to ClPool2d
Rename ClDirectConvolutionKernel to ClDirectConv2dKernel

Resolves: COMPMID-4405
Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
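For code that instantiated these internal, experimental-API operators directly, the renames are purely mechanical. As an illustrative sketch (the surrounding tensor-info setup is hypothetical; class and method names are those used in this patch):

    // Before this patch:
    //   auto op = std::make_unique<cpu::CpuPooling>();
    // After this patch:
    auto op = std::make_unique<cpu::CpuPool2d>();
    op->configure(src_info, dst_info, pool_info, /* indices */ nullptr);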
---
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  16 +-
 src/runtime/CL/functions/CLPoolingLayer.cpp        |  14 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  66 +--
 .../NEON/functions/NEDirectConvolutionLayer.cpp    |  16 +-
 src/runtime/NEON/functions/NEPoolingLayer.cpp      |  16 +-
 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp   | 523 +++++++++++++++++++
 src/runtime/cpu/operators/CpuDepthwiseConv2d.h     | 213 ++++++++
 .../CpuDepthwiseConv2dAssemblyDispatch.cpp         | 563 +++++++++++++++++++++
 .../operators/CpuDepthwiseConv2dAssemblyDispatch.h |  86 ++++
 .../cpu/operators/CpuDepthwiseConvolution.cpp      | 523 -------------------
 .../cpu/operators/CpuDepthwiseConvolution.h        | 230 ---------
 .../CpuDepthwiseConvolutionAssemblyDispatch.cpp    | 563 ---------------------
 .../CpuDepthwiseConvolutionAssemblyDispatch.h      |  97 ----
 src/runtime/cpu/operators/CpuDirectConv2d.cpp      | 147 ++++++
 src/runtime/cpu/operators/CpuDirectConv2d.h        | 107 ++++
 src/runtime/cpu/operators/CpuDirectConvolution.cpp | 147 ------
 src/runtime/cpu/operators/CpuDirectConvolution.h   | 121 -----
 src/runtime/cpu/operators/CpuPool2d.cpp            | 156 ++++++
 src/runtime/cpu/operators/CpuPool2d.h              |  87 ++++
 src/runtime/cpu/operators/CpuPooling.cpp           | 156 ------
 src/runtime/cpu/operators/CpuPooling.h             |  99 ----
 src/runtime/gpu/cl/operators/ClDirectConv2d.cpp    | 102 ++++
 src/runtime/gpu/cl/operators/ClDirectConv2d.h      |  83 +++
 .../gpu/cl/operators/ClDirectConvolution.cpp       | 102 ----
 src/runtime/gpu/cl/operators/ClDirectConvolution.h |  92 ----
 src/runtime/gpu/cl/operators/ClPool2d.cpp          | 101 ++++
 src/runtime/gpu/cl/operators/ClPool2d.h            |  72 +++
 src/runtime/gpu/cl/operators/ClPooling.cpp         | 101 ----
 src/runtime/gpu/cl/operators/ClPooling.h           |  75 ---
 29 files changed, 2304 insertions(+), 2370 deletions(-)
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.h
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolution.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h
 create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConvolution.h
 create mode 100644 src/runtime/cpu/operators/CpuPool2d.cpp
 create mode 100644 src/runtime/cpu/operators/CpuPool2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuPooling.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuPooling.h
 create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClDirectConv2d.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClDirectConvolution.h
 create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClPool2d.h
 delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.cpp
 delete mode 100644 src/runtime/gpu/cl/operators/ClPooling.h

diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 74867ff64f..907e69d8d7 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -29,17 +29,17 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/runtime/gpu/cl/operators/ClActivation.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLDirectConvolutionLayer::Impl
 {
-    const ICLTensor                              *src{ nullptr };
-    const ICLTensor                              *weights{ nullptr };
-    const ICLTensor                              *biases{ nullptr };
-    ICLTensor                                    *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConvolution>  op{ nullptr };
+    const ICLTensor                         *src{ nullptr };
+    const ICLTensor                         *weights{ nullptr };
+    const ICLTensor                         *biases{ nullptr };
+    ICLTensor                               *dst{ nullptr };
+    std::unique_ptr<opencl::ClDirectConv2d>  op{ nullptr };
 };
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
@@ -65,14 +65,14 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
     _impl->biases  = biases;
     _impl->dst     = output;
 
-    _impl->op = std::make_unique<opencl::ClDirectConvolution>();
+    _impl->op = std::make_unique<opencl::ClDirectConv2d>();
     _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
 Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    return opencl::ClDirectConvolution::validate(input, weights, biases, output, conv_info, act_info);
+    return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
 }
 
 void CLDirectConvolutionLayer::run()
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index fbaec1d2d9..7ba911c342 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -26,16 +26,16 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
 
 namespace arm_compute
 {
 struct CLPoolingLayer::Impl
 {
-    const ICLTensor                    *src{ nullptr };
-    ICLTensor                          *dst{ nullptr };
-    ICLTensor                          *indices{ nullptr };
-    std::unique_ptr<opencl::ClPooling>  op{ nullptr };
+    const ICLTensor                   *src{ nullptr };
+    ICLTensor                         *dst{ nullptr };
+    ICLTensor                         *indices{ nullptr };
+    std::unique_ptr<opencl::ClPool2d>  op{ nullptr };
 };
 
 CLPoolingLayer::CLPoolingLayer()
@@ -55,13 +55,13 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso
     _impl->dst     = output;
     _impl->indices = indices;
 
-    _impl->op = std::make_unique<opencl::ClPooling>();
+    _impl->op = std::make_unique<opencl::ClPool2d>();
     _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return opencl::ClPooling::validate(input, output, pool_info, indices);
+    return opencl::ClPool2d::validate(input, output, pool_info, indices);
 }
 
 void CLPoolingLayer::run()
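Both CL functions above follow the same pimpl pattern: the public layer keeps only tensor pointers plus the renamed operator, and repacks them at run time. A condensed sketch of the pattern (member alignment elided; slot names as used throughout this patch):

    struct Impl
    {
        const ICLTensor                   *src{ nullptr };
        ICLTensor                         *dst{ nullptr };
        std::unique_ptr<opencl::ClPool2d>  op{ nullptr };
    };
    // run() then forwards the stored pointers on every call, roughly:
    //   ITensorPack pack{ { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
    //   _impl->op->run(pack);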
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index da9610ef42..a561b88058 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,7 +27,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
 
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
@@ -47,15 +47,15 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
     const ITensor *biases
     {
         nullptr
-    };                                                  // SRC_2
-    Tensor                                              permuted_input{};   // INT_0
-    Tensor                                              permuted_weights{}; // INT_1
-    Tensor                                              permuted_output{};  // INT_2
-    Tensor                                              workspace{};        // INT_3
-    Tensor                                              packed_weights{};   // INT_4
-    std::shared_ptr<cpu::CpuDepthwiseConvolution>       op{ nullptr };
-    bool                                                is_prepared{ false };
-    bool                                                permute{ false };
+    };                                             // SRC_2
+    Tensor                                         permuted_input{};   // INT_0
+    Tensor                                         permuted_weights{}; // INT_1
+    Tensor                                         permuted_output{};  // INT_2
+    Tensor                                         workspace{};        // INT_3
+    Tensor                                         packed_weights{};   // INT_4
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>       op{ nullptr };
+    bool                                           is_prepared{ false };
+    bool                                           permute{ false };
 };
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
@@ -80,7 +80,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     _impl->dst     = output;
     _impl->permute = is_nhwc;
 
-    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
     _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
                          _impl->dst->info(), info);
@@ -97,7 +97,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     }
 
     info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
-    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConvolutionAssemblyDispatch>();
+    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
 
     if(is_nhwc)
     {
@@ -154,7 +154,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal
                                                                                       const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
@@ -197,17 +197,17 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
 
 struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
 {
-    Tensor                                        permuted_input{};
-    Tensor                                        permuted_weights{};
-    Tensor                                        permuted_output{};
-    bool                                          is_prepared{ false };
-    bool                                          is_nchw{ false };
-    bool                                          is_activationlayer_enabled{ false };
-    const ITensor                                *weights{ nullptr };
-    const ITensor                                *biases{ nullptr };
-    const ITensor                                *src{ nullptr };
-    ITensor                                      *dst{ nullptr };
-    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+    Tensor                                   permuted_input{};
+    Tensor                                   permuted_weights{};
+    Tensor                                   permuted_output{};
+    bool                                     is_prepared{ false };
+    bool                                     is_nchw{ false };
+    bool                                     is_activationlayer_enabled{ false };
+    const ITensor                           *weights{ nullptr };
+    const ITensor                           *biases{ nullptr };
+    const ITensor                           *src{ nullptr };
+    ITensor                                 *dst{ nullptr };
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
 };
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -223,7 +223,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));
 
     const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    _impl->op = std::make_unique<cpu::CpuDepthwiseConvolution>();
+    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
     _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
 
     _impl->src     = input;
@@ -253,7 +253,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
         output_to_use = &_impl->permuted_output;
     }
 
-    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConvolutionNativeKernel>();
+    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
     depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
 
     if(_impl->is_nchw)
@@ -273,7 +273,7 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate
                                                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
@@ -298,10 +298,10 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
 #ifndef DOXYGEN_SKIP_THIS
 struct NEDepthwiseConvolutionLayer::Impl
 {
-    DepthwiseConvolutionFunction                  depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
-    NEDepthwiseConvolutionLayerOptimizedInternal  func_optimized{ nullptr };
-    NEDepthwiseConvolutionLayerGeneric            func_generic{};
-    std::shared_ptr<cpu::CpuDepthwiseConvolution> op{ nullptr };
+    DepthwiseConvolutionFunction                 depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
+    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+    NEDepthwiseConvolutionLayerGeneric           func_generic{};
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{ nullptr };
 };
 #endif // DOXYGEN_SKIP_THIS
@@ -309,7 +309,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
                                             const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    _impl->op              = std::make_shared<cpu::CpuDepthwiseConvolution>();
+    _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>();
     _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
     switch(_impl->depth_conv_func)
@@ -329,7 +329,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
                                              unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    return cpu::CpuDepthwiseConvolution::validate(input, weights, biases, output, info);
+    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 73834381c6..58530e4a8f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,17 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConvolution.h"
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
 
 namespace arm_compute
 {
 struct NEDirectConvolutionLayer::Impl
 {
-    ITensor                                    *src{ nullptr };
-    const ITensor                              *weights{ nullptr };
-    const ITensor                              *bias{ nullptr };
-    ITensor                                    *dst{ nullptr };
-    std::unique_ptr<cpu::CpuDirectConvolution>  op{ nullptr };
+    ITensor                               *src{ nullptr };
+    const ITensor                         *weights{ nullptr };
+    const ITensor                         *bias{ nullptr };
+    ITensor                               *dst{ nullptr };
+    std::unique_ptr<cpu::CpuDirectConv2d>  op{ nullptr };
 };
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -52,14 +52,14 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
     _impl->weights = weights;
     _impl->bias    = bias;
     _impl->dst     = output;
-    _impl->op      = std::make_unique<cpu::CpuDirectConvolution>(_memory_manager);
+    _impl->op      = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
     _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
 }
 
 Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    return cpu::CpuDirectConvolution::validate(input, weights, bias, output, conv_info, act_info);
+    return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
 }
 
 void NEDirectConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 1570cdeedc..bbf3e7cc4e 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,17 +26,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuPooling.h"
+#include "src/runtime/cpu/operators/CpuPool2d.h"
 
 namespace arm_compute
 {
 struct NEPoolingLayer::Impl
 {
-    ITensor                          *src{ nullptr };
-    ITensor                          *dst{ nullptr };
-    ITensor                          *indices{ nullptr };
-    Tensor                            workspace{ nullptr };
-    std::unique_ptr<cpu::CpuPooling>  op{ nullptr };
+    ITensor                         *src{ nullptr };
+    ITensor                         *dst{ nullptr };
+    ITensor                         *indices{ nullptr };
+    Tensor                           workspace{ nullptr };
+    std::unique_ptr<cpu::CpuPool2d>  op{ nullptr };
 };
 
 NEPoolingLayer::~NEPoolingLayer() = default;
@@ -51,7 +51,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     _impl->src     = input;
     _impl->dst     = output;
     _impl->indices = indices;
-    _impl->op      = std::make_unique<cpu::CpuPooling>();
+    _impl->op      = std::make_unique<cpu::CpuPool2d>();
     _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 
     // Allocate workspace based on kernel's memory requirements
@@ -66,7 +66,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return cpu::CpuPooling::validate(input, output, pool_info, indices);
+    return cpu::CpuPool2d::validate(input, output, pool_info, indices);
 }
 
 void NEPoolingLayer::run()
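The comment in NEPoolingLayer::configure above ("Allocate workspace based on kernel's memory requirements") refers to the experimental workspace() interface these operators expose. A minimal sketch of how a wrapper can service it (assuming a single requirement; member names are illustrative):

    // Query the scratch needs reported by the operator and back them with a Tensor.
    const experimental::MemoryRequirements reqs = _impl->op->workspace();
    if(!reqs.empty())
    {
        _impl->workspace.allocator()->init(TensorInfo(TensorShape{ reqs[0].size + reqs[0].alignment }, 1, DataType::S8));
        _impl->workspace.allocator()->allocate();
    }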
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..160a9fd70b
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    if(!is_data_type_quantized_per_channel(weights->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+                                info.pad_stride_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+                                info.pad_stride_info.pad_bottom());
+
+    if(biases != nullptr)
+    {
+        const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
+
+    // Validate Activation Layer
+    if(info.act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+    }
+    return Status{};
+}
+} // namespace
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal()
+    : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo           *src,
+                                                                        const ITensorInfo     *weights,
+                                                                        const ITensorInfo     *biases,
+                                                                        ITensorInfo           *dst,
+                                                                        const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+                                                                             dst, info));
+
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _has_bias     = biases != nullptr;
+    _is_nchw      = src->data_layout() == DataLayout::NCHW;
+    _permute      = _is_nchw;
+    _is_prepared  = false;
+
+    // Configure pipeline
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(info.act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(info.act_info);
+    _is_activationlayer_enabled         = info.act_info.enabled() && !(is_relu || is_relu6);
+
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = info.act_info;
+    }
+
+    _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<CpuPermute>();
+        _permute_weights = std::make_unique<CpuPermute>();
+        _permute_output  = std::make_unique<CpuPermute>();
+
+        auto input_perm   = std::make_unique<TensorInfo>();
+        auto weights_perm = std::make_unique<TensorInfo>();
+        auto output_perm  = std::make_unique<TensorInfo>();
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+
+        output_perm->set_data_layout(DataLayout::NHWC);
+        output_perm->set_quantization_info(dst->quantization_info());
+
+        // Configure optimized depthwise
+        _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        output_perm->set_data_layout(DataLayout::NHWC);
+        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+    }
+    else
+    {
+        _dwc_optimized_func->configure(src, weights, biases, dst, info);
+    }
+
+    // Configure activation
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo     *src,
+                                                                         const ITensorInfo     *weights,
+                                                                         const ITensorInfo     *biases,
+                                                                         const ITensorInfo     *dst,
+                                                                         const ConvolutionInfo &info)
+{
+    return validate_arguments_optimized(src, weights, biases, dst, info);
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    prepare(tensors);
+
+    auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst            = tensors.get_tensor(TensorType::ACL_DST_0);
+    auto workspace      = tensors.get_tensor(TensorType::ACL_INT_3);
+    auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+    // Permute input
+    if(_permute)
+    {
+        ITensorPack pack;
+        auto        src      = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+        auto        src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+        pack.add_tensor(TensorType::ACL_SRC, src);
+        pack.add_tensor(TensorType::ACL_DST, src_perm);
+        _permute_input->run(pack);
+    }
+
+    // Run assembly function
+    if(_is_nchw)
+    {
+        auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+        auto dst_perm     = tensors.get_tensor(TensorType::ACL_INT_2);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
+        pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
+        pack.add_tensor(TensorType::ACL_SRC_2, bias);
+        pack.add_tensor(TensorType::ACL_INT_0, workspace);
+        pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+        pack.add_tensor(TensorType::ACL_DST, dst_perm);
+        _dwc_optimized_func->run(pack);
+    }
+    else
+    {
+        auto src     = tensors.get_tensor(TensorType::ACL_SRC_0);
+        auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
+        auto dst     = tensors.get_tensor(TensorType::ACL_DST);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, src);
+        pack.add_tensor(TensorType::ACL_SRC_1, weights);
+        pack.add_tensor(TensorType::ACL_SRC_2, bias);
+        pack.add_tensor(TensorType::ACL_INT_0, workspace);
+        pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _dwc_optimized_func->run(pack);
+    }
+
+    // Permute output
+    if(_is_nchw)
+    {
+        ITensorPack pack;
+        auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+        pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _permute_output->run(pack);
+    }
+
+    // Run activation
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
+{
+    if(!_is_prepared)
+    {
+        auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+        auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+        // Permute weights
+        if(_permute)
+        {
+            auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+            ITensorPack pack;
+            pack.add_tensor(TensorType::ACL_SRC, weights);
+            pack.add_tensor(TensorType::ACL_DST, permuted_weights);
+            _permute_weights->run(pack);
+
+            weights->mark_as_unused();
+
+            ITensorPack pack_opt;
+            pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
+            pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+            pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+            // Prepare optimized function
+            _dwc_optimized_func->prepare(pack_opt);
+        }
+        else
+        {
+            ITensorPack pack_opt;
+            pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+            pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+            pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+            // Prepare optimized function
+            _dwc_optimized_func->prepare(pack_opt);
+        }
+
+        _is_prepared = true;
+    }
+}
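prepare() is the one-off stage: weights are permuted and packed a single time, after which the source weight tensor is released with mark_as_unused(). The contract this implies for callers, as a sketch:

    op.prepare(pack); // permutes/packs weights into the ACL_INT_* tensors, marks 'weights' unused
    op.run(pack);     // subsequent runs read only the packed copy
    // The original weights buffer may be reclaimed by a memory manager after prepare().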
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric()
+    : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
+      _is_activationlayer_enabled(false)
+{
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
+                                                            dst, info));
+
+    _is_nchw     = src->data_layout() == DataLayout::NCHW;
+    _is_prepared = !_is_nchw;
+
+    ITensorInfo       *input_to_use   = src;
+    const ITensorInfo *weights_to_use = weights;
+    ITensorInfo       *output_to_use  = dst;
+
+    auto input_perm   = std::make_unique<TensorInfo>();
+    auto weights_perm = std::make_unique<TensorInfo>();
+    auto output_perm  = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+    if(_is_nchw)
+    {
+        _permute_input   = std::make_unique<CpuPermute>();
+        _permute_weights = std::make_unique<CpuPermute>();
+
+        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+        input_perm->set_data_layout(DataLayout::NHWC);
+        input_to_use = input_perm.get();
+
+        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+        weights_perm->set_data_layout(DataLayout::NHWC);
+        weights_to_use = weights_perm.get();
+
+        output_to_use = output_perm.get();
+    }
+
+    _depthwise_conv_kernel = std::make_unique<kernels::CpuDepthwiseConv2dNativeKernel>();
+    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+    if(_is_nchw)
+    {
+        _permute_output = std::make_unique<CpuPermute>();
+        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+        output_perm->set_data_layout(DataLayout::NHWC);
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = info.act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, nullptr, info.act_info);
+    }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                               const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    if(src->data_layout() == DataLayout::NCHW)
+    {
+        TensorShape permuted_input_shape   = src->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+        const TensorInfo permuted_input   = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_output  = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+    }
+
+    // Validate Activation Layer
+    if(info.act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+    }
+
+    return Status{};
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
+{
+    auto src     = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    auto biases  = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst     = tensors.get_tensor(TensorType::ACL_DST_0);
+
+    if(_is_nchw)
+    {
+        prepare(tensors);
+        auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+        auto dst_perm     = tensors.get_tensor(TensorType::ACL_INT_2);
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, src);
+        pack.add_tensor(TensorType::ACL_DST, src_perm);
+        _permute_input->run(pack);
+
+        ITensorPack pack_depth;
+        pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
+        pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
+        pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+        pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+    }
+    else
+    {
+        ITensorPack pack_depth;
+        pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
+        pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
+        pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+        pack_depth.add_tensor(TensorType::ACL_DST, dst);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+    }
+
+    if(_is_nchw)
+    {
+        ITensorPack pack;
+        auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+        pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _permute_output->run(pack);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
+{
+    if(!_is_prepared)
+    {
+        auto weights      = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+
+        ARM_COMPUTE_ERROR_ON(!weights->is_used());
+
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, weights);
+        pack.add_tensor(TensorType::ACL_DST, weights_perm);
+
+        _permute_weights->run(pack);
+        weights->mark_as_unused();
+        _is_prepared = true;
+    }
+}
+
+CpuDepthwiseConv2d::CpuDepthwiseConv2d()
+    : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic()
+{
+}
+
+void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.configure(src, weights, biases, dst, info);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.configure(src, weights, biases, dst, info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+    switch(depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                                                   const ConvolutionInfo &info)
+{
+    if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+    {
+        return DepthwiseConvolutionFunction::OPTIMIZED;
+    }
+    else
+    {
+        return DepthwiseConvolutionFunction::GENERIC;
+    }
+}
+
+void CpuDepthwiseConv2d::run(ITensorPack &tensors)
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.run(tensors);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.run(tensors);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
+
+void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.prepare(tensors);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.prepare(tensors);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
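Like the other operators in this series, CpuDepthwiseConv2d holds no tensor pointers; callers supply them per call through an ITensorPack using the ACL_SRC_*/ACL_DST_* slots seen above. A minimal usage sketch (tensor creation elided; src, weights, biases and dst are assumed ITensor pointers):

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // input
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // weights
    pack.add_const_tensor(TensorType::ACL_SRC_2, biases);  // may be nullptr
    pack.add_tensor(TensorType::ACL_DST_0, dst);           // output

    op.prepare(pack); // one-off weight packing/permutation
    op.run(pack);     // per-inference execution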
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..049397fe60
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+    /** Default constructor */
+    CpuDepthwiseConv2d();
+    /** Initialize the function's source, destination, weights and convolution information.
+     *
+     * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+     * @param[in]      weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      info    Depthwise convolution meta-data.
+     */
+    void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuDepthwiseConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+    /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+     *
+     * @param[in] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+     *                    Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                    Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] dst     Destination tensor. Data type supported: same as @p src.
+     * @param[in] info    Depthwise convolution meta-data.
+     *
+     * @return a Depthwise Convolution Function
+     */
+    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                                                          const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+
+private:
+    /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+     *
+     * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
+     *
+     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
+     * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+     * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required
+     * -# @ref NEActivationLayer if fused activation is required
+     *
+     */
+    class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConv2dOptimizedInternal();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConv2dOptimizedInternal() = default;
+        /** Initialize the function's source, destination, kernels and border_size.
+         *
+         * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+         * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration
+         *
+         * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                         _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                      _activationlayer_function{ nullptr };
+        bool                                                _has_bias{ false };
+        bool                                                _is_quantized{ false };
+        bool                                                _is_nchw{ true };
+        bool                                                _permute{ false };
+        bool                                                _is_activationlayer_enabled{ false };
+        bool                                                _is_prepared{ false };
+    };
+
+    /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+     *
+     * -# @ref CpuDepthwiseConv2dNativeKernel
+     *
+     */
+    class CpuDepthwiseConv2dGeneric : public ICpuOperator
+    {
+    public:
+        /** Default constructor */
+        CpuDepthwiseConv2dGeneric();
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
+        /** Default move constructor */
+        CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
+        /** Prevent instances of this class from being copied (As this class contains pointers) */
+        CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
+        /** Default move assignment operator */
+        CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
+        /** Default destructor */
+        ~CpuDepthwiseConv2dGeneric() = default;
+        /** Initialize the function's source, destination, weights and convolution information.
+         *
+         * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      info    Depthwise convolution meta-data.
+         */
+        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+        /** Static function to check if given info will lead to a valid configuration
+         *
+         * Similar to CpuDepthwiseConv2dGeneric::configure()
+         *
+         * @return a status
+         */
+        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+        void run(ITensorPack &tensors) override;
+        void prepare(ITensorPack &tensors) override;
+
+    private:
+        std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_input{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_weights{ nullptr };
+        std::unique_ptr<CpuPermute>                              _permute_output{ nullptr };
+        std::unique_ptr<CpuActivation>                           _activationlayer_function{ nullptr };
+        bool                                                     _is_nchw{ true };
+        bool                                                     _is_prepared{ false };
+        bool                                                     _is_activationlayer_enabled{ false };
+    };
+
+    DepthwiseConvolutionFunction        _depth_conv_func;
+    CpuDepthwiseConv2dOptimizedInternal _func_optimized;
+    CpuDepthwiseConv2dGeneric           _func_generic;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */
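The header above also documents the selection contract: get_depthwiseconvolution_function() probes the optimized path's validate() and falls back to GENERIC otherwise. Caller-side, the same check is one expression (sketch; DepthwiseConvolutionFunction values are as declared in this series):

    const auto func = cpu::CpuDepthwiseConv2d::get_depthwiseconvolution_function(src, weights, biases, dst, info);
    const bool assembly_path_selected = (func == DepthwiseConvolutionFunction::OPTIMIZED);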
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" +#include "src/core/helpers/AutoConfiguration.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qasymm8::QAsymm8RescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int 
in_cols, int n_channels, + neon_convolution_kernels::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + int dilation_factor, neon_convolution_kernels::ActivationFunction activation, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return 
std::make_unique>( + n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + +std::unique_ptr create_convolver(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *output, + const ConvolutionInfo &info) +{ + const DataType data_type = src->data_type(); + const TensorShape shape = src->tensor_shape(); + + const int n_batches = shape[3]; + const int in_rows = shape.z(); + const int in_cols = shape.y(); + const int n_channels = shape.x(); + const int dilation_factor = info.dilation.x(); + const int padding_top = info.pad_stride_info.pad_top(); + const int padding_left = info.pad_stride_info.pad_left(); + const int padding_bottom = info.pad_stride_info.pad_bottom(); + const int padding_right = info.pad_stride_info.pad_right(); + + const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); + const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + + const unsigned int stride_x = info.pad_stride_info.stride().first; + const unsigned int kernel_size = weights->tensor_shape().y(); + + // Map activation function + neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; + if(arm_compute::utils::info_helpers::is_relu(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU; + } + else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) + { + activation = neon_convolution_kernels::ActivationFunction::ReLU6; + } + + // Create quantized convolver + if(is_uniform_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); + + return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else if(is_perchannel_quantized) + { + const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + 
ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + std::vector fmultipliers; + std::vector qmultipliers; + std::vector qshifts; + + for(auto const s : wqinfo.scales) + { + const float fmultipler = iqinfo.scale * s / oqinfo.scale; + int32_t qmultiplier = 0; + int32_t qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + fmultipliers.push_back(fmultipler); + qmultipliers.push_back(qmultiplier); + qshifts.push_back(qshift); + } + + qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); + + return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } + else + { + // Create float convolver + switch(data_type) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + { + return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); + } + default: + return nullptr; + } + } +} +} // namespace + +struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl +{ + std::unique_ptr dwc_assembly_kernel{ nullptr }; + NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; + bool is_prepared{ false }; + experimental::MemoryRequirements mem_req{}; +}; + +#ifndef DOXYGEN_SKIP_THIS +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() + : _pImpl(std::make_unique()) +{ +} +#endif /* DOXYGEN_SKIP_THIS */ + +CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; + +void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_UNUSED(bias); + ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src, + weights, + bias != nullptr ? 
bias : nullptr, + dst, + info)); + + // Output auto initialization if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info())); + + _pImpl->is_prepared = false; + + // Create convolver + _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info); + ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); + + // Create assembly kernel wrapper + _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); + + constexpr size_t alignment = 128; + + // Create workspace + const unsigned int num_threads = NEScheduler::get().num_threads(); + const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); + ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0!"); + _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + // Create packing tensor + const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); + ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0!"); + + _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); +} + +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +{ + return _pImpl->mem_req; +} + +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + + // Validate convolver + ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info)); + + // Validate activation + const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); + ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); + + // Check bias + if(bias != nullptr) + { + unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); + } + + // Check output + if(dst->total_size() != 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + // The uniform quantization case will only have 1 scale value in the weights quantization info + const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->quantization_info(); + const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); + for(auto const s : weights_qinfo.scale()) + { + const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale; + ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); + } + + return
Status{}; +} + +bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src, + const ITensorInfo *weights, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + + // Reshape input shape if in NHWC format + const DataLayout data_layout = src->data_layout(); + TensorShape in_shape{ src->tensor_shape() }; + if(data_layout == DataLayout::NHWC) + { + in_shape.set(Window::DimX, src->tensor_shape().y()); + in_shape.set(Window::DimY, src->tensor_shape().z()); + in_shape.set(Window::DimZ, src->tensor_shape().x()); + } + + // Check data type + const DataType input_type = src->data_type(); + const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; + const DataType weights_type = weights->data_type(); + const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED + || weights_type == DataType::QSYMM8_PER_CHANNEL; + + // Check weights size + std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_w = weights->dimension(width_idx); + const unsigned int kernel_h = weights->dimension(height_idx); + bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); + + // Check for supported strides + const auto &strides = info.pad_stride_info.stride(); + bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); + + // Check for supported padding + const auto pad_top = info.pad_stride_info.pad_top(); + const auto pad_right = info.pad_stride_info.pad_right(); + const auto pad_bottom = info.pad_stride_info.pad_bottom(); + const auto pad_left = info.pad_stride_info.pad_left(); + PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); + bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); + bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); + bool supported_padding = is_same_padding || is_valid_padding; + // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported + bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); + + if(weights_type == DataType::QSYMM8_PER_CHANNEL) + { + is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); + } + + return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; +} + +void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) +{ + // Prepare assembly kernel + prepare(tensors); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Setup inputs/outputs + ARM_COMPUTE_ERROR_ON(workspace == nullptr || workspace->buffer() == nullptr); + _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer())); + + ARM_COMPUTE_ERROR_ON(src->buffer() == nullptr);
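For reference, a minimal sketch of driving the support query above (the shapes and the aggregate-initialised ConvolutionInfo are illustrative assumptions, not part of this patch):

    // Hypothetical NHWC F32 case: batch 1, 56x56 spatial, 64 channels,
    // 3x3 kernel, stride 1, SAME padding, no dilation, depth multiplier 1.
    TensorInfo src_info(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo weights_info(TensorShape(64U, 3U, 3U), 1, DataType::F32);
    weights_info.set_data_layout(DataLayout::NHWC);
    const ConvolutionInfo conv_info{ PadStrideInfo(1, 1, 1, 1), 1, ActivationLayerInfo(), Size2D(1U, 1U) };
    const bool use_assembly = cpu::CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(&src_info, &weights_info, conv_info);
    // Expected to hold true here; an unsupported kernel size, stride, padding
    // scheme, dilation or depth multiplier flips it to false.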
+ const int input_element_size = src->info()->element_size(); + const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; + const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; + const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; + const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); + + ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); + const int output_element_size = dst->info()->element_size(); + const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; + const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; + const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; + void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); + + // Schedule assembly kernel + NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); +} + +void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) +{ + if(!_pImpl->is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); + + // Pack weights and bias + const int weights_element_size = weights->info()->element_size(); + const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; + const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; + _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), + weights->buffer() + weights->info()->offset_first_element_in_bytes(), + weights_row_stride, + weights_col_stride, + (bias != nullptr) ? bias->buffer() : nullptr); + _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); + + weights->mark_as_unused(); + if(bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h new file mode 100644 index 0000000000..195942b7fd --- /dev/null +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H + +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyDispatch(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); + /** Default destructor */ + ~CpuDepthwiseConv2dAssemblyDispatch(); + + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src. + * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + /** Check if the optimized kernel can be used for the given kernel sizes and strides + * + * @warning Even if this returns true, the inputs and outputs might need to be permuted, as the only supported layout is NHWC + * + * @param[in] src Input tensor info. + * @param[in] weights Weights tensor info. + * @param[in] info Depthwise convolution meta-data. + * + * @return True if the assembly kernel could be used, else false. Note that transformations of input/output could be needed. + */ + static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + struct LocalImpl; + std::unique_ptr<LocalImpl> _pImpl; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp deleted file mode 100644 index 6d097280e0..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDepthwiseConvolution.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > input->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > input->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, info)); - - //Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - return Status{}; -} -} // namespace - 
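The two dilation bounds validated above compare the effective (dilated) kernel extent against the padded input; a quick worked instance of that arithmetic:

    // Effective kernel extent along one axis: k + (k - 1) * (dilation - 1).
    // For a 3x3 kernel with dilation 2 that is 3 + 2 * 1 = 5, so the check
    // requires input_w + pad_left + pad_right >= 5 (and likewise for height).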
-CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::CpuDepthwiseConvolutionOptimizedInternal() - : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::configure(ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - _has_bias = biases != nullptr; - _is_nchw = input->data_layout() == DataLayout::NCHW; - _permute = _is_nchw; - _is_prepared = false; - - // Configure pipeline - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); - - if(!_is_activationlayer_enabled) - { - act_info_to_use = info.act_info; - } - - _dwc_optimized_func = std::make_unique(); - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - _permute_output = std::make_unique(); - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(); - - // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - - // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - - output_perm->set_data_layout(DataLayout::NHWC); - output_perm->set_quantization_info(output->quantization_info()); - - // Configure optimized depthwise - _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); - - // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - output_perm->set_data_layout(DataLayout::NHWC); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - } - else - { - _dwc_optimized_func->configure(input, weights, biases, output, info); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - return validate_arguments_optimized(input, weights, biases, output, info); -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - - auto bias = 
tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute input - if(_permute) - { - ITensorPack pack; - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - } - - // Run assembly function - if(_is_nchw) - { - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src_perm); - pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst_perm); - _dwc_optimized_func->run(pack); - } - else - { - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src); - pack.add_tensor(TensorType::ACL_SRC_1, weights); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst); - _dwc_optimized_func->run(pack); - } - - // Permute output - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - // Run activation - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionOptimizedInternal::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute weights - if(_permute) - { - auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, permuted_weights); - _permute_weights->run(pack); - - weights->mark_as_unused(); - - ITensorPack pack_opt; - pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - else - { - ITensorPack pack_opt; - pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::CpuDepthwiseConvolutionGeneric() - : _depthwise_conv_kernel(nullptr), 
_permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false), - _is_activationlayer_enabled(false) -{ -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolution::validate(input, weights, (biases == nullptr) ? nullptr : biases, - output, info)); - - _is_nchw = input->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; - - ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - ITensorInfo *output_to_use = output; - - auto input_perm = std::make_unique(); - auto weights_perm = std::make_unique(); - auto output_perm = std::make_unique(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - - if(_is_nchw) - { - _permute_input = std::make_unique(); - _permute_weights = std::make_unique(); - - _permute_input->configure(input, input_perm.get(), PermutationVector(2U, 0U, 1U)); - input_perm->set_data_layout(DataLayout::NHWC); - input_to_use = input_perm.get(); - - _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); - weights_perm->set_data_layout(DataLayout::NHWC); - weights_to_use = weights_perm.get(); - - output_to_use = output_perm.get(); - } - - _depthwise_conv_kernel = std::make_unique(); - _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - - if(_is_nchw) - { - _permute_output = std::make_unique(); - _permute_output->configure(output_perm.get(), output, PermutationVector(1U, 2U, 0U)); - output_perm->set_data_layout(DataLayout::NHWC); - } - - //Configure Activation Layer - _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(output, nullptr, info.act_info); - } -} - -Status CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - if(input->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); - 
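The permutation vectors used throughout these NCHW paths follow dst[i] = src[perm[i]]; a small sketch of the round trip, assuming the arm_compute::permute() shape helper:

    TensorShape shape(7U, 5U, 3U);                 // NCHW logical order: [W, H, C]
    permute(shape, PermutationVector(2U, 0U, 1U)); // -> [3, 7, 5], i.e. [C, W, H] (NHWC)
    permute(shape, PermutationVector(1U, 2U, 0U)); // -> [7, 5, 3], back to [W, H, C]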
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConvolutionNativeKernel::validate(input, weights, biases, output, info)); - } - - // Validate Activation Layer - if(info.act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, info.act_info)); - } - - return Status{}; -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - - if(_is_nchw) - { - prepare(tensors); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - - ITensorPack pack_depth; - pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); - pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - else - { - ITensorPack pack_depth; - pack_depth.add_tensor(TensorType::ACL_SRC_0, src); - pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConvolution::CpuDepthwiseConvolutionGeneric::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(!weights->is_used()); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, weights_perm); - - _permute_weights->run(pack); - weights->mark_as_unused(); - _is_prepared = true; - } -} - -CpuDepthwiseConvolution::CpuDepthwiseConvolution() - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic() -{ -} - -void CpuDepthwiseConvolution::configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info) -{ - _depth_conv_func = get_depthwiseconvolution_function(input, 
weights, (biases != nullptr) ? biases : nullptr, output, info); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CpuDepthwiseConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, info); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - return CpuDepthwiseConvolutionGeneric::validate(input, weights, biases, output, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CpuDepthwiseConvolution::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info) -{ - if(bool(CpuDepthwiseConvolutionOptimizedInternal::validate(input, weights, biases, output, info))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CpuDepthwiseConvolution::run(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CpuDepthwiseConvolution::prepare(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h b/src/runtime/cpu/operators/CpuDepthwiseConvolution.h deleted file mode 100644 index e39cb7db4d..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolution.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZATION_H -#define ARM_COMPUTE_CPU_DEQUANTIZATION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to execute a depthwise convolution. - */ -class CpuDepthwiseConvolution : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConvolution(); - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors info with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConvolution - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. 
- * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - -private: - /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels: - * - * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported - * - * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConvolution3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present - * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required - * -# @ref NEActivationLayer if fused activation is required - * - */ - class CpuDepthwiseConvolutionOptimizedInternal : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionOptimizedInternal(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionOptimizedInternal(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(const CpuDepthwiseConvolutionOptimizedInternal &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionOptimizedInternal &operator=(CpuDepthwiseConvolutionOptimizedInternal &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionOptimizedInternal() = default; - /** Initialize the function's source, destination, kernels and border_size. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolution3x3 - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _dwc_optimized_func{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - }; - - /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: - * - * -# @ref CpuDepthwiseConvolutionNativeKernel - * - */ - class CpuDepthwiseConvolutionGeneric : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConvolutionGeneric(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionGeneric(CpuDepthwiseConvolutionGeneric &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionGeneric &operator=(const CpuDepthwiseConvolutionGeneric &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionGeneric &operator=(CpuDepthwiseConvolutionGeneric &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionGeneric() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionGeneric - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. 
A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; - }; - - DepthwiseConvolutionFunction _depth_conv_func; - CpuDepthwiseConvolutionOptimizedInternal _func_optimized; - CpuDepthwiseConvolutionGeneric _func_generic; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZATION_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index 039714abb1..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, 
in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr create_convolver(const ITensorInfo *input, - const ITensorInfo *weights, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - const DataType data_type = input->data_type(); - const TensorShape shape = input->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = info.dilation.x(); - const int padding_top = info.pad_stride_info.pad_top(); - const int padding_left = info.pad_stride_info.pad_left(); - const int padding_bottom = info.pad_stride_info.pad_bottom(); - const int padding_right = info.pad_stride_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == 
DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = info.pad_stride_info.stride().first; - const unsigned int kernel_size = weights->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector fmultipliers; - std::vector qmultipliers; - std::vector qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, 
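
    [Aside, not part of the patch] Both quantized paths above reduce requantization to a
    fixed-point multiply-and-shift via quantization::calculate_quantized_multiplier_less_than_one;
    the per-channel path simply repeats the decomposition once per weight scale, as in the loop
    above. A minimal standalone sketch of the usual gemmlowp-style decomposition follows. It is
    an illustration only, not the library's implementation; rounding and saturation details may differ.

        #include <cmath>
        #include <cstdint>

        // Split a real multiplier M in [0, 1) into a Q0.31 fixed-point multiplier and a
        // right shift, so that M ~= q_multiplier * 2^(-31 - q_shift).
        void decompose_multiplier(float multiplier, int32_t *q_multiplier, int32_t *q_shift)
        {
            int   exponent = 0;
            float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
            auto  q        = static_cast<int64_t>(std::llround(static_cast<double>(mantissa) * (1ll << 31)));
            if(q == (1ll << 31)) // rounding pushed the mantissa up to 1.0: renormalise
            {
                q /= 2;
                ++exponent;
            }
            *q_multiplier = static_cast<int32_t>(q);
            *q_shift      = -exponent; // non-negative for multipliers below 1
        }

    At run time the kernel can then requantize an int32 accumulator with one widening multiply
    by q_multiplier followed by a rounding right shift of (31 + q_shift) bits, with no floating
    point involved.
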
padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{}; - bool is_prepared{ false }; - experimental::MemoryRequirements mem_req{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch() - : _pImpl(std::make_unique()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default; - -void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(bias); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input, - weights, - bias != nullptr ? bias : nullptr, - output, - info)); - - // Output auto inizialitation if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - _pImpl->is_prepared = false; - - // Create convolver - _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info); - ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment }); -} - -experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const -{ - return _pImpl->mem_req; -} - -Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != 
DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weighs size - std::set supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = info.pad_stride_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = info.pad_stride_info.pad_top(); - 
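
    [Aside, not part of the patch] The padding check in is_optimized_supported accepts exactly
    two schemes: VALID (all four pads zero) and SAME as computed by calculate_same_pad. For
    reference, the SAME arithmetic amounts to the following sketch, written under the usual
    convention of pushing the surplus pixel toward bottom/right; it is assumed equivalent to,
    not copied from, the library function.

        #include <algorithm>

        struct Pad2D
        {
            unsigned int top, bottom, left, right;
        };

        Pad2D same_pad(unsigned int in_h, unsigned int in_w, unsigned int k_h, unsigned int k_w,
                       unsigned int stride, unsigned int dilation)
        {
            auto total = [&](unsigned int in, unsigned int k) -> unsigned int
            {
                const unsigned int out   = (in + stride - 1) / stride; // SAME output size: ceil(in / stride)
                const unsigned int eff_k = (k - 1) * dilation + 1;     // dilated kernel extent
                const int needed = static_cast<int>((out - 1) * stride + eff_k) - static_cast<int>(in);
                return static_cast<unsigned int>(std::max(needed, 0));
            };
            const unsigned int pad_h = total(in_h, k_h);
            const unsigned int pad_w = total(in_w, k_w);
            // Split as evenly as possible, surplus toward bottom/right
            return { pad_h / 2, pad_h - pad_h / 2, pad_w / 2, pad_w - pad_w / 2 };
        }

    For example, a 224x224 input with a 3x3 kernel, stride 2 and dilation 1 yields pads of
    (top, bottom, left, right) = (0, 1, 0, 1), which is what the dispatcher compares against.
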
const auto pad_right = info.pad_stride_info.pad_right(); - const auto pad_bottom = info.pad_stride_info.pad_bottom(); - const auto pad_left = info.pad_stride_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; -} - -void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors) -{ - // Prepare assembly kernel - prepare(tensors); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr); - _pImpl->dwc_assembly_kernel->set_working_space(static_cast(workspace->buffer())); - - ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr); - const int input_element_size = src->info()->element_size(); - const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); - const int output_element_size = dst->info()->element_size(); - const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); -} - -void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors) -{ - if(!_pImpl->is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = weights->info()->element_size(); - const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; - const int 
weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), - weights->buffer() + weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (bias != nullptr) ? bias->buffer() : nullptr); - _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); - - weights->mark_as_unused(); - if(bias != nullptr) - { - bias->mark_as_unused(); - } - _pImpl->is_prepared = true; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h deleted file mode 100644 index 6aac74c3ef..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Depthwise convolution assembly kernel glue */ -class CpuDepthwiseConvolutionAssemblyDispatch : public ICpuOperator -{ -public: - CpuDepthwiseConvolutionAssemblyDispatch(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move constructor */ - CpuDepthwiseConvolutionAssemblyDispatch(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(const CpuDepthwiseConvolutionAssemblyDispatch &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConvolutionAssemblyDispatch &operator=(CpuDepthwiseConvolutionAssemblyDispatch &&) = default; - /** Default destructor */ - ~CpuDepthwiseConvolutionAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. 
Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionAssemblyDispatch - * - * @note Supports only NHWC format - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution meta-data. - * - * @return An error status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC - * - * @param[in] input Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] info Depthwise convolution meta-data. - * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. - */ - static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - struct LocalImpl; - std::unique_ptr _pImpl; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp new file mode 100644 index 0000000000..8812b777a3 --- /dev/null +++ b/src/runtime/cpu/operators/CpuDirectConv2d.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConv2d::~CpuDirectConv2d() = default;
+
+CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
+      _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+{
+}
+
+void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    _output_stage_kernel  = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
+    _conv_kernel          = std::make_unique<kernels::CpuDirectConv2dKernel>();
+    _input_border_handler = std::make_unique<NEFillBorderKernel>();
+
+    // Free accumulator
+    if(_accumulator.buffer() != nullptr)
+    {
+        _accumulator.allocator()->free();
+    }
+
+    _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
+    // Check if bias should be added in the convolution result
+    _has_bias = (bias != nullptr);
+
+    _conv_kernel->configure(src, weights, dst, conv_info);
+    if(_has_bias)
+    {
+        _output_stage_kernel->configure(dst, bias);
+    }
+    _is_padding_required = !_conv_kernel->border_size().empty();
+
+    if(_is_padding_required)
+    {
+        // Add zero padding XY
+        _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function = std::make_unique<CpuActivation>();
+        _activationlayer_function->configure(dst, dst, act_info);
+    }
+}
+
+Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+    // output might not be initialized since it can be an intermediate tensor of another layer
+    DataType   data_type = src->data_type();
+    TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+    // Validate Convolution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of input feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+    }
+
+    // Validate bias kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
+
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
+    }
+
+    return Status{};
+}
+
+void CpuDirectConv2d::run(ITensorPack &tensors)
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    auto src  = tensors.get_tensor(TensorType::ACL_SRC_0);
+    auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    auto dst  = tensors.get_tensor(TensorType::ACL_DST);
+
+    if(_is_padding_required)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_DST, src);
+        NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+    }
+    NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+    if(_has_bias)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC_0, dst);
+        pack.add_tensor(TensorType::ACL_SRC_1, bias);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, dst);
+        pack.add_tensor(TensorType::ACL_DST, dst);
+        _activationlayer_function->run(pack);
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h
new file mode 100644
index 0000000000..9e584b9c49
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuDirectConv2d.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
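
    [Aside, not part of the patch] A usage sketch of the operator-style API that these ported
    classes expose: the operator is configured on ITensorInfo objects and executed on an
    ITensorPack, with tensors bound by slot. The slots below mirror CpuDirectConv2d::run()
    above; src, weights, bias and dst are assumed to be already-allocated Tensor objects, and
    the exact wiring used by the library lives in the NEDirectConvolutionLayer wrapper.

        using namespace arm_compute;

        cpu::CpuDirectConv2d conv;
        conv.configure(src.info(), weights.info(), bias.info(), dst.info(), conv_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input, read from ACL_SRC_0 in run()
        pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights, consumed by the conv kernel
        pack.add_tensor(TensorType::ACL_SRC_2, &bias);    // optional bias, ACL_SRC_2 in run()
        pack.add_tensor(TensorType::ACL_DST, &dst);       // output
        conv.run(pack);

+/** Function to run the direct convolution.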
+ * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConv2dOutputStageKernel + * -# @ref kernels::CpuDirectConv2dKernel + */ +class CpuDirectConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuDirectConv2d(std::shared_ptr memory_manager = nullptr); + /** Destructor */ + ~CpuDirectConv2d(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr _output_stage_kernel; + std::unique_ptr _conv_kernel; + std::unique_ptr _input_border_handler; + std::unique_ptr _activationlayer_function; + Tensor _accumulator; + bool _has_bias{ false }; + bool _is_activationlayer_enabled{ false }; + unsigned int _dim_split{ 0 }; + bool _is_padding_required{ false }; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.cpp b/src/runtime/cpu/operators/CpuDirectConvolution.cpp deleted file mode 100644 index 33f79603e8..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDirectConvolution.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuDirectConvolution::~CpuDirectConvolution() = default; - -CpuDirectConvolution::CpuDirectConvolution(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() -{ -} - -void CpuDirectConvolution::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique(); - _conv_kernel = std::make_unique(); - _input_border_handler = std::make_unique(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = src->data_layout() == DataLayout::NCHW ? 
Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(dst, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(dst, dst, act_info); - } -} - -Status CpuDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = src->data_type(); - TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionKernel::validate(src, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConvolutionOutputStageKernel::validate(&accumulator, bias, dst)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); - } - - return Status{}; -} - -void CpuDirectConvolution::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_is_padding_required) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); - } - NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, dst); - pack.add_tensor(TensorType::ACL_SRC_1, bias); - pack.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConvolution.h b/src/runtime/cpu/operators/CpuDirectConvolution.h deleted file mode 100644 index 0635e087fd..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConvolution.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H -#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h" -#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to run the direct convolution. - * - * This function calls the following kernels: - * - * -# @ref NEFillBorderKernel for the input - * -# @ref kernels::CpuDirectConvolutionOutputStageKernel - * -# @ref kernels::CpuDirectConvolutionKernel - */ -class CpuDirectConvolution : public ICpuOperator -{ -public: - /** Constructor */ - CpuDirectConvolution(std::shared_ptr memory_manager = nullptr); - /** Destructor */ - ~CpuDirectConvolution(); - /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[in] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _output_stage_kernel; - std::unique_ptr _conv_kernel; - std::unique_ptr _input_border_handler; - std::unique_ptr _activationlayer_function; - Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONVOLUTION_H */ diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp new file mode 100644 index 0000000000..b225199c40 --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuPool2d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/cpu/kernels/CpuPool2dKernel.h" +#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuPool2d::CpuPool2d() + : _pooling_layer_kernel(), + _border_handler(), + _asm_glue(), + _is_global_pooling_layer(false), + _data_layout(DataLayout::NCHW), + _mem_req() +{ +} + +CpuPool2d::~CpuPool2d() = default; + +void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + // Check if we can run assembly kernels. Currently, indices are not supported by those kernels + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + // Get data layout + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + // Check if we have Global Pooling Layer + const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); + + if(run_optimised) + { + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + auto pooling_wrapper = std::make_unique(); + ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); + pooling_wrapper->configure(src, dst, pool_info, ci); + + // Get kernel's memory requirements + constexpr size_t alignment = 4096; + const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); + _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + + _asm_glue = std::move(pooling_wrapper); + } + else + { + // Configure pooling kernel + auto k = std::make_unique(); + k->configure(src, dst, pool_info, indices); + _pooling_layer_kernel = std::move(k); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); + if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique(); + b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); + _border_handler = std::move(b); + break; + } + case DataLayout::NHWC: + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + if(run_optimised) + { + return Status{}; + } + + return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); +} + +void CpuPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + if(_asm_glue) + { + const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; + NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); + } + else + { + switch(_data_layout) + { + case DataLayout::NCHW: + // Fill border + NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); + + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + break; + case DataLayout::NHWC: + // Run pooling layer + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +experimental::MemoryRequirements CpuPool2d::workspace() const +{ + return _mem_req; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h new file mode 100644 index 0000000000..ae3d115dfc --- /dev/null +++ b/src/runtime/cpu/operators/CpuPool2d.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
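
    [Aside, not part of the patch] CpuPool2d::workspace() above reports the auxiliary memory
    the assembly path needs (slot ACL_INT_0, 4096-byte aligned); the caller owns the
    allocation and hands it back through the pack. A hedged sketch of that contract follows.
    The member names slot/size/alignment are read off the push_back in configure(); src and
    dst are assumed to be allocated Tensors, and the input/output slot choices are
    assumptions (the NEPoolingLayer wrapper shows the canonical packing).

        using namespace arm_compute;

        cpu::CpuPool2d pool;
        pool.configure(src.info(), dst.info(), pool_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);

        Tensor workspace; // backing buffer for ACL_INT_0, kept alive across run()
        if(!pool.workspace().empty())
        {
            const auto &req = pool.workspace()[0]; // pushed as { ACL_INT_0, size, 4096 } in configure()
            workspace.allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8), req.alignment);
            workspace.allocator()->allocate();
            pack.add_tensor(req.slot, &workspace);
        }
        pool.run(pack);
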
+ */ +#ifndef ARM_COMPUTE_CPU_POOL2D_H +#define ARM_COMPUTE_CPU_POOL2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" + +#include + +namespace arm_compute +{ +// Forward Declarations +struct PoolingLayerInfo; + +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref kernels::CpuPool2dKernel + * -# @ref kernels::CpuPool2dAssemblyWrapperKernel + */ +class CpuPool2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuPool2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); + /** Default destructor */ + ~CpuPool2d(); + /** Set the src and dst tensors. + * + * @note F16 is supported for pool sizes 2 and 3 only + * + * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr _pooling_layer_kernel; + std::unique_ptr _border_handler; + std::unique_ptr _asm_glue; + + bool _is_global_pooling_layer; + DataLayout _data_layout; + experimental::MemoryRequirements _mem_req; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPooling.cpp deleted file mode 100644 index 3a6ac24a74..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPooling.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h" -#include "src/core/cpu/kernels/CpuPoolingKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuPooling::CpuPooling() - : _pooling_layer_kernel(), - _border_handler(), - _asm_glue(), - _is_global_pooling_layer(false), - _data_layout(DataLayout::NCHW), - _mem_req() -{ -} - -CpuPooling::~CpuPooling() = default; - -void CpuPooling::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - - // Check if we have Global Pooling Layer - const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - - if(run_optimised) - { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); - - auto pooling_wrapper = std::make_unique(); - ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); - pooling_wrapper->configure(src, dst, pool_info, ci); - - // Get kernel's memory requirements - constexpr size_t alignment = 4096; - const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); - - _asm_glue = std::move(pooling_wrapper); - } - else - { - // Configure pooling kernel - auto k = std::make_unique(); - k->configure(src, dst, pool_info, indices); - _pooling_layer_kernel = std::move(k); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value((indices) ? 
std::numeric_limits::min() : 0.f); - if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique(); - b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); - _border_handler = std::move(b); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -Status CpuPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - const bool run_optimised = bool(kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - if(run_optimised) - { - return Status{}; - } - - return kernels::CpuPoolingKernel::validate(src, dst, pool_info, indices); -} - -void CpuPooling::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - - if(_asm_glue) - { - const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; - NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); - } - else - { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); - - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -experimental::MemoryRequirements CpuPooling::workspace() const -{ - return _mem_req; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h deleted file mode 100644 index bc30adf762..0000000000 --- a/src/runtime/cpu/operators/CpuPooling.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
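
    [Aside, not part of the patch] One detail worth calling out in the NCHW path above: for
    quantized pooling the CONSTANT border is built as PixelValue(0, data_type, qinfo) rather
    than a raw zero byte. Quantizing the real value 0 maps it to the zero-point, so padded
    samples contribute true zeros to the accumulation. A simplified sketch of that mapping
    (the library's exact rounding policy may differ):

        #include <algorithm>
        #include <cmath>
        #include <cstdint>

        uint8_t quantize_qasymm8(float value, float scale, int32_t offset)
        {
            const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
            return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // clamp to the QASYMM8 range
        }

    Here quantize_qasymm8(0.f, scale, offset) returns offset, the zero-point; filling the
    border with raw 0x00 would instead inject the real value -offset * scale.
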
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h
deleted file mode 100644
index bc30adf762..0000000000
--- a/src/runtime/cpu/operators/CpuPooling.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOLING_H
-#define ARM_COMPUTE_CPU_POOLING_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward Declarations
-struct PoolingLayerInfo;
-
-namespace cpu
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::CpuPoolingKernel
- * -# @ref kernels::CpuPoolingAssemblyWrapperKernel
- */
-class CpuPooling : public ICpuOperator
-{
-public:
-    /** Constructor */
-    CpuPooling();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CpuPooling(const CpuPooling &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CpuPooling &operator=(const CpuPooling &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CpuPooling(CpuPooling &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CpuPooling &operator=(CpuPooling &&) = delete;
-    /** Default destructor */
-    ~CpuPooling();
-    /** Set the src and dst tensors.
-     *
-     * @note F16 is supported for pool sizes 2 and 3 only
-     *
-     * @param[in, out] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out]     dst       Destination tensor info. Data types supported: same as @p src.
-     * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out]     indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling
-     *
-     * @note F16 is supported for pool sizes 2 and 3 only
-     *
-     * @param[in] src       Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst       Destination tensor info. Data types supported: same as @p src.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) Tensor info of the indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    std::unique_ptr<INEKernel> _pooling_layer_kernel;
-    std::unique_ptr<INEKernel> _border_handler;
-    std::unique_ptr<INEKernel> _asm_glue;
-
-    bool                             _is_global_pooling_layer;
-    DataLayout                       _data_layout;
-    experimental::MemoryRequirements _mem_req;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_H */
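The interface deleted above survives the rename verbatim in src/runtime/cpu/operators/CpuPool2d.h, so existing call sites only swap the type name. A minimal usage sketch under that assumption; `src`, `dst` and `pool_info` are hypothetical, pre-allocated objects inside namespace arm_compute:

#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuPool2d.h"

void run_pool2d(Tensor &src, Tensor &dst, const PoolingLayerInfo &pool_info)
{
    cpu::CpuPool2d op;
    // validate() is static, so configurations can be rejected before any state is built
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuPool2d::validate(src.info(), dst.info(), pool_info));
    op.configure(src.info(), dst.info(), pool_info);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    op.run(pack); // non-assembly path: no extra workspace slots needed
}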
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
new file mode 100644
index 0000000000..527b3a65f9
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+ITensorPack select_activation_src_dst(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
+    pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
+    return pack;
+}
+} // namespace
+
+void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                               const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    // Configure direct convolution kernel
+    auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, weights, biases, dst, conv_info);
+    _direct_conv_kernel = std::move(k);
+
+    // Configure border handler
+    PixelValue zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(src->data_type()))
+    {
+        zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _src_border_handler = std::move(b);
+
+    if(act_info.enabled())
+    {
+        auto a = std::make_unique<kernels::ClActivationKernel>();
+        a->configure(compile_context, dst, dst, act_info);
+        _activation_kernel = std::move(a);
+    }
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
+}
+
+Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
+    }
+    return Status{};
+}
+
+void ClDirectConv2d::run(ITensorPack &tensors)
+{
+    // Run border handler
+    CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
+    // Run direct convolution
+    CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
+    // Run activation kernel
+    if(_activation_kernel)
+    {
+        auto act_pack = select_activation_src_dst(tensors);
+        CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
+    }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
new file mode 100644
index 0000000000..e069733fab
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a direct convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClDirectConv2dKernel
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClDirectConv2d() = default;
+    /** Set the src and dst tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of srcs.
+     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+     *                             Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
+    std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
+    std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
\ No newline at end of file
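The new operator is stateless with respect to tensors: configure() only sees ITensorInfo, and the actual CLTensor objects arrive at run() inside an ITensorPack, which is how the CLDirectConvolutionLayer wrapper drives it. A sketch of direct use; the ACL_SRC_0/1/2 slot convention is an assumption taken from the wrapper, and all tensor and info objects are hypothetical, inside namespace arm_compute:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"

void run_direct_conv(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst,
                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
    opencl::ClDirectConv2d conv;
    conv.configure(CLKernelLibrary::get().get_compile_context(),
                   src.info(), weights.info(), biases.info(), dst.info(), conv_info, act_info);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input
    pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights
    pack.add_tensor(TensorType::ACL_SRC_2, &biases);  // biases
    pack.add_tensor(TensorType::ACL_DST, &dst);       // output
    conv.run(pack);
}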
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp b/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
deleted file mode 100644
index 3382a6c3c5..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDirectConvolution.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-ITensorPack select_activation_src_dst(ITensorPack &tensors)
-{
-    ITensorPack pack;
-    pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
-    pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
-    return pack;
-}
-} // namespace
-
-void ClDirectConvolution::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                    const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
-    // Configure direct convolution kernel
-    auto k = std::make_unique<kernels::ClDirectConvolutionKernel>();
-    k->set_target(CLScheduler::get().target());
-    k->configure(compile_context, src, weights, biases, dst, conv_info);
-    _direct_conv_kernel = std::move(k);
-
-    // Configure border handler
-    PixelValue zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(src->data_type()))
-    {
-        zero_value = PixelValue(0, src->data_type(), src->quantization_info());
-    }
-    auto b = std::make_unique<CLFillBorderKernel>();
-    b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
-    _src_border_handler = std::move(b);
-
-    if(act_info.enabled())
-    {
-        auto a = std::make_unique<kernels::ClActivationKernel>();
-        a->configure(compile_context, dst, dst, act_info);
-        _activation_kernel = std::move(a);
-    }
-
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
-}
-
-Status ClDirectConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConvolutionKernel::validate(src, weights, biases, dst, conv_info, CLScheduler::get().target()));
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
-    }
-    return Status{};
-}
-
-void ClDirectConvolution::run(ITensorPack &tensors)
-{
-    // Run border handler
-    CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
-    // Run direct convolution
-    CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
-    // Run activation kernel
-    if(_activation_kernel)
-    {
-        auto act_pack = select_activation_src_dst(tensors);
-        CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
-    }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConvolution.h b/src/runtime/gpu/cl/operators/ClDirectConvolution.h
deleted file mode 100644
index e7ad927b0b..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConvolution.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
-#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClDirectConvolution
- */
-class ClDirectConvolution : public IClOperator
-{
-public:
-    /** Constructor */
-    ClDirectConvolution() = default;
-    /** Set the src and dst tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of srcs.
-     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
-     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
-     *                             Data types supported: Same as @p src.
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolution
-     *
-     * @param[in] src       Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of srcs.
-     *                      Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
-     * @param[in] biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                      Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] dst       Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
-     *                      Data types supported: Same as @p src.
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] act_info  (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
-    std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
-    std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONVOLUTION_H */
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp
new file mode 100644
index 0000000000..40c2b0a8ba
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClPool2d.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClPool2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    // Configure pooling kernel
+    auto k = std::make_unique<kernels::ClPool2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, dst, info, indices);
+    _pooling = std::move(k);
+
+    const DataType data_type = src->data_type();
+
+    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+    BorderMode border_mode{};
+    PixelValue pixel_value(0.f);
+    if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
+    {
+        pixel_value = PixelValue(0, data_type, src->quantization_info());
+    }
+
+    // Data layout
+    const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+            border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            break;
+        case DataLayout::NHWC:
+            border_mode = BorderMode::CONSTANT;
+            if(PoolingType::MAX == info.pool_type)
+            {
+                if(is_data_type_quantized(data_type))
+                {
+                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
+                }
+                else
+                {
+                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+                }
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
+    _border_handler = std::move(b);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_pooling);
+}
+
+Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+{
+    return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+
+void ClPool2d::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
+    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
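A gloss on the border configuration above: with a CONSTANT border, the fill value must be the identity element of the pooling reduction so that padded positions can never win a MAX. That is why the float path uses the lowest representable value and the quantized path takes the type minimum from get_min_max (the quantized zero point floor, e.g. 0 for QASYMM8). A small sketch of the rule, mirroring the switch above; the helper name is hypothetical:

#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include <limits>
#include <tuple>

// Sketch: the pad value used for NHWC MAX pooling, as selected above.
// max(x, pad) == x must hold for every representable x.
arm_compute::PixelValue max_pool_pad_value(arm_compute::DataType dt)
{
    if(arm_compute::is_data_type_quantized(dt))
    {
        arm_compute::PixelValue min_val{}, max_val{};
        std::tie(min_val, max_val) = arm_compute::get_min_max(dt); // e.g. 0 for QASYMM8
        return min_val;
    }
    return arm_compute::PixelValue(std::numeric_limits<float>::lowest());
}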
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..8ac386a64b
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClPool2dKernel
+ */
+class ClPool2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClPool2d() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]  info            Pooling layer parameters.
+     * @param[out] indices         (optional) The indices info of the maximal values. Data type supported: U32.
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _pooling{ nullptr };
+    std::unique_ptr<IClKernel> _border_handler{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
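Note also that ClPool2d::run() enqueues both of its kernels with flush set to false, so nothing forces the OpenCL queue; the caller synchronises when the result is needed on the host. A short sketch of the resulting call pattern, with hypothetical tensors and assuming the common ACL_SRC/ACL_DST slot convention:

// Sketch: validate, configure, run, then sync explicitly since run() does not flush.
opencl::ClPool2d pool;
ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPool2d::validate(src.info(), dst.info(), pool_info));
pool.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info(), pool_info);

ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, &src);
pack.add_tensor(TensorType::ACL_DST, &dst);
pool.run(pack);
CLScheduler::get().sync(); // block until border fill and pooling have finished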
diff --git a/src/runtime/gpu/cl/operators/ClPooling.cpp b/src/runtime/gpu/cl/operators/ClPooling.cpp
deleted file mode 100644
index 8610eb9842..0000000000
--- a/src/runtime/gpu/cl/operators/ClPooling.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPooling.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPoolingKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPooling::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-    // Configure pooling kernel
-    auto k = std::make_unique<kernels::ClPoolingKernel>();
-    k->set_target(CLScheduler::get().target());
-    k->configure(compile_context, src, dst, info, indices);
-    _pooling = std::move(k);
-
-    const DataType data_type = src->data_type();
-
-    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode{};
-    PixelValue pixel_value(0.f);
-    if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
-    {
-        pixel_value = PixelValue(0, data_type, src->quantization_info());
-    }
-
-    // Data layout
-    const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-            border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-            break;
-        case DataLayout::NHWC:
-            border_mode = BorderMode::CONSTANT;
-            if(PoolingType::MAX == info.pool_type)
-            {
-                if(is_data_type_quantized(data_type))
-                {
-                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
-                }
-                else
-                {
-                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
-                }
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-    auto b = std::make_unique<CLFillBorderKernel>();
-    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
-    _border_handler = std::move(b);
-
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(*_pooling);
-}
-
-Status ClPooling::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
-{
-    return kernels::ClPoolingKernel::validate(src, dst, info, indices);
-}
-
-void ClPooling::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
-    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
-    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClPooling.h b/src/runtime/gpu/cl/operators/ClPooling.h
deleted file mode 100644
index 99de6d0dcf..0000000000
--- a/src/runtime/gpu/cl/operators/ClPooling.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOLING_H
-#define ARM_COMPUTE_CL_POOLING_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClPooling
- */
-class ClPooling : public IClOperator
-{
-public:
-    /** Constructor */
-    ClPooling() = default;
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
-     * @param[in]  info            Pooling layer parameters.
-     * @param[out] indices         (optional) The indices info of the maximal values. Data type supported: U32.
-     */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClPooling
-     *
-     * @param[in]  src     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst     Destination tensor info. Data type supported: same as @p src
-     * @param[in]  info    Pooling layer parameters.
-     * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::unique_ptr<IClKernel> _pooling{ nullptr };
-    std::unique_ptr<IClKernel> _border_handler{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOLING_H */
--
cgit v1.2.1