From ec0113dd7749991959ae351934eea0c0d8077dcb Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Wed, 9 Nov 2022 09:26:27 +0000
Subject: Optimize Transposed Convolution for CL backend (FP32/16)

This patch optimizes transposed convolution for the CL backend by
rewriting it as a single kernel instead of three (flip_kernel +
upsample + conv). The new kernel skips the upsampling step, which
shrinks the convolution input space by a factor of stride_x * stride_y
and yields a significant performance improvement. It also skips the
kernel flipping by traversing the weights accordingly, thus reducing
the memory footprint.

Resolves: COMPMID-5676
Signed-off-by: Gunes Bayir
Change-Id: I8a333212dc7c5f7f0597aa58b0d56d44814baa14
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8588
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 68 +++++++++++++++++++++--
 1 file changed, 62 insertions(+), 6 deletions(-)

diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index c348bfcd0c..a4db6d7770 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClTransposedConvolution.h"
 
 #include "src/common/utils/Log.h"
 
@@ -39,8 +41,19 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
+struct CLDeconvolutionLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    const ICLTensor                     *weights{ nullptr };
+    const ICLTensor                     *biases{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<opencl::IClOperator> op{ nullptr };
+};
+
+CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
+
 CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _function()
+    : _memory_manager(std::move(memory_manager)), _function(), _impl(std::make_unique<Impl>())
 {
 }
 
@@ -59,6 +72,19 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC
     switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
+        {
+            auto op = std::make_unique<opencl::ClTransposedConvolution>();
+            op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info);
+
+            _impl->src     = input;
+            _impl->weights = weights;
+            _impl->biases  = bias;
+            _impl->dst     = output;
+
+            _impl->op = std::move(op);
+            break;
+        }
+        case DeconvolutionMethod::UPSCALE_CONV2D:
         {
             auto f = std::make_unique<CLDirectDeconvolutionLayer>();
             f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info);
@@ -85,6 +111,12 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
     switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
+        {
+            // Validate transposed convolution operator
+            ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+            break;
+        }
+        case DeconvolutionMethod::UPSCALE_CONV2D:
         {
             // Validate direct convolution layer
             ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
@@ -109,11 +141,16 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
 {
     ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if(input->data_layout() == DataLayout::NHWC && (input->data_type() == DataType::F32 || input->data_type() == DataType::F16))
     {
         return DeconvolutionMethod::DIRECT;
     }
 
+    if(is_data_type_quantized_per_channel(weights->data_type()))
+    {
+        return DeconvolutionMethod::UPSCALE_CONV2D;
+    }
+
     const DataLayout data_layout = input->data_layout();
 
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -121,7 +158,7 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
 
     if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
     {
-        return DeconvolutionMethod::DIRECT;
+        return DeconvolutionMethod::UPSCALE_CONV2D;
     }
 
     return DeconvolutionMethod::GEMM;
@@ -130,10 +167,29 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
 void CLDeconvolutionLayer::run()
 {
     prepare();
-    _function->run();
+
+    if(_impl->op != nullptr)
+    {
+        // Optimized Operator will be used
+        ITensorPack pack;
+
+        pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+        pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+        pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+        _impl->op->run(pack);
+    }
+    else
+    {
+        _function->run();
+    }
 }
 
 void CLDeconvolutionLayer::prepare()
 {
-    _function->prepare();
+    if(_impl->op == nullptr)
+    {
+        _function->prepare();
+    }
 }
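
As an aside, the arithmetic that lets a single kernel replace the flip + upsample + conv pipeline can be sketched in a few lines. The following is a minimal scalar reference in plain C++, not the actual ClTransposedConvolution OpenCL kernel: the single-channel layout, the square stride and padding, and the function name are all assumptions made for illustration.

#include <cstddef>
#include <vector>

// Reference sketch: direct (gather-style) transposed convolution over a
// single-channel FP32 image. Each output pixel pulls in exactly the input
// samples that the scatter/upsample formulation would have pushed to it,
// so no upsampled intermediate is ever materialised and the weights are
// read as stored (the 180-degree flip is absorbed by the traversal).
std::vector<float> transposed_conv2d_direct(const std::vector<float> &in, int in_w, int in_h,
                                            const std::vector<float> &w, int k_w, int k_h,
                                            int stride, int pad, int out_w, int out_h)
{
    std::vector<float> out(static_cast<std::size_t>(out_w) * out_h, 0.0f);
    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            float acc = 0.0f;
            for(int ky = 0; ky < k_h; ++ky)
            {
                for(int kx = 0; kx < k_w; ++kx)
                {
                    // Position this tap would sample in the virtual upsampled input
                    const int ty = oy + pad - ky;
                    const int tx = ox + pad - kx;
                    // Taps falling on inserted zeros contribute nothing; skipping
                    // them is what shrinks the work by stride_x * stride_y
                    if(ty % stride != 0 || tx % stride != 0)
                    {
                        continue;
                    }
                    const int iy = ty / stride;
                    const int ix = tx / stride;
                    // Bounds check also covers negative ty/tx
                    if(iy < 0 || iy >= in_h || ix < 0 || ix >= in_w)
                    {
                        continue;
                    }
                    acc += in[static_cast<std::size_t>(iy) * in_w + ix] * w[static_cast<std::size_t>(ky) * k_w + kx];
                }
            }
            out[static_cast<std::size_t>(oy) * out_w + ox] = acc;
        }
    }
    return out;
}

The "% stride" test is where the zero rows and columns of the virtual upsampled input drop out, which is why the effective convolution input shrinks by a factor of stride_x * stride_y; and reading w[ky][kx] as stored in this gather form absorbs the 180-degree kernel flip that the upsample-and-convolve formulation would otherwise require.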