Diffstat (limited to 'src/runtime')
 src/runtime/NEON/functions/NEGEMM.cpp                          |  17
 src/runtime/NEON/functions/NEGEMMConv2d.cpp                    | 139
 src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp    |  15
 src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp              | 206
 src/runtime/cpu/operators/CpuGemmDirectConv2d.h                | 112
 src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 190
 src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h   |  35
 7 files changed, 473 insertions(+), 241 deletions(-)
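The common thread in the hunks below is that cpu::CpuGemmAssemblyDispatch stops holding ITensor pointers and becomes a stateless operator: configure() now receives ITensorInfo descriptors only, while prepare() and run() get the actual buffers through an ITensorPack keyed by ACL_SRC_0/1/2 and ACL_DST. The following is a minimal sketch of that calling pattern, assuming the headers from this source tree and an already-populated AsmGemmInfo; the free function run_asm_gemm and its surrounding setup are illustrative only, not part of the patch.

```cpp
// Sketch of the stateless-dispatch calling pattern introduced by this patch
// (mirrors the NEGEMM.cpp hunk below). Include paths follow this source tree.
#include "arm_compute/core/ITensor.h"
#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

using namespace arm_compute;

void run_asm_gemm(ITensor *a, ITensor *b, ITensor *c, ITensor *d,
                  const cpu::AsmGemmInfo &asm_info)
{
    cpu::CpuGemmAssemblyDispatch asm_glue{};

    // configure() is metadata-only: it sees ITensorInfo descriptors, not buffers.
    const ITensorInfo *c_info = (c != nullptr) ? c->info() : nullptr;
    asm_glue.configure(a->info(), b->info(), c_info, d->info(), asm_info);
    if(!asm_glue.is_configured())
    {
        return; // unsupported configuration; the caller falls back to another path
    }

    // The actual buffers travel in an ITensorPack at execution time.
    ITensorPack tensors
    {
        { TensorType::ACL_SRC_0, a },
        { TensorType::ACL_SRC_1, b },
        { TensorType::ACL_SRC_2, c },
        { TensorType::ACL_DST, d },
    };
    asm_glue.prepare(tensors); // one-off work, e.g. pretransposing B
    asm_glue.run(tensors);
}
```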
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index b84128e6c0..7318c3e492 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -89,10 +89,19 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe if(run_optimised) { - const ITensor *c_to_use = is_c_bias ? c : nullptr; - _asm_glue->configure(a, b, c_to_use, d, asm_info); + const ITensor *c_to_use = is_c_bias ? c : nullptr; + const ITensorInfo *c_info_to_use = c_to_use != nullptr ? c_to_use->info() : nullptr; + _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); + _asm_glue_tensors = + { + { ACL_SRC_0, a }, + { ACL_SRC_1, b }, + { ACL_SRC_2, c_to_use }, + { ACL_DST, d }, + }; + // Scale product by alpha if(_run_alpha_scale) { @@ -314,7 +323,7 @@ void NEGEMM::run() if(_asm_glue->is_configured()) { - _asm_glue->run(); + _asm_glue->run(_asm_glue_tensors); if(_run_alpha_scale) { _alpha_scale_func.run(); @@ -368,7 +377,7 @@ void NEGEMM::prepare() ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); } - _asm_glue->prepare(); + _asm_glue->prepare(_asm_glue_tensors); if(!original_b_managed_by_weights_manager) { _original_b->mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index ddeacc85f5..94ceb6d27c 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -26,151 +26,48 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" #include <set> namespace arm_compute { -namespace -{ -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act) -{ - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = input->quantization_info(); - const QuantizationInfo wqinfo = weights->quantization_info(); - const QuantizationInfo oqinfo = (output->total_size() == 0) ? 
iqinfo : output->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - const DataType data_type = input->data_type(); - // Merge activation with output stage - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); - } - GEMMLowpOutputStageInfo os_info; - os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - os_info.gemmlowp_offset = uoqinfo.offset; - os_info.gemmlowp_min_bound = min_activation; - os_info.gemmlowp_max_bound = max_activation; - os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); - return os_info; -} -cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +using OperatorType = cpu::CpuGemmDirectConv2d; + +struct NEGEMMConv2d::Impl { - cpu::AsmGemmInfo asm_info; - asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv; - asm_info.ps_info = info.conv_info; - asm_info.activation_info = info.act_info; - asm_info.depth_output_gemm3d = true; - asm_info.reinterpret_input_as_3d = true; - asm_info.padding_top = info.conv_info.pad_top(); - asm_info.padding_left = info.conv_info.pad_left(); - asm_info.padding_value = 0.f; - asm_info.negated_offsets = false; - return asm_info; -} -} // namespace + ITensorPack tensors{}; + std::unique_ptr<OperatorType> op{ nullptr }; +}; NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) - : _gemm_asm_func(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager)), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false), - _run_activation(false) + : _impl(std::make_unique<Impl>()) { + _impl->op = std::make_unique<OperatorType>(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - info)); - _original_weights = weights; - _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 }); + _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input); + _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights); + _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases); + _impl->tensors.add_tensor(TensorType::ACL_DST, output); - // Configure assembly dispatch - cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - if(is_data_type_quantized(input->info()->data_type())) - { - asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info); - } - _gemm_asm_func->configure(input, &_permuted_weights, biases, output, asm_info); - - // Configure activation - if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info)) - { - _activation_func.configure(output, nullptr, info.act_info); - _run_activation = true; - } + _impl->op->configure(input->info(), weights->info(), biases->info(), output->info(), info); } + Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); - const DataType data_type = input->data_type(); - const TensorShape i_shape = input->tensor_shape(); - const TensorShape w_shape = weights->tensor_shape(); - ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - // Validate biases - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(data_type)) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else if(data_type == DataType::BFLOAT16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(input, weights, biases, output, asm_info)); - return Status{}; + return OperatorType::validate(input, weights, biases, output, info); } void NEGEMMConv2d::run() { - prepare(); - - _gemm_asm_func->run(); - if(_run_activation) - { - _activation_func.run(); - } + _impl->op->run(_impl->tensors); } void NEGEMMConv2d::prepare() { - if(!_is_prepared) - { - _permuted_weights.allocator()->allocate(); - _weights_permute_func.run(); - _original_weights->mark_as_unused(); - _is_prepared = true; - } + _impl->op->prepare(_impl->tensors); } } 
// namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 53dd39e549..cc0f20e695 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -146,14 +146,21 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - _asm_glue->configure(a_to_use, b, c, output, asm_info); + auto c_info_to_use = c == nullptr ? nullptr : c->info(); + _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); _fused_assembly_path = _asm_glue->is_configured(); + _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); + _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output); } else { - _asm_glue->configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info); + auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output); + _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); + _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); } _assembly_path = _asm_glue->is_configured(); + _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); break; } default: @@ -513,7 +520,7 @@ void NEGEMMLowpMatrixMultiplyCore::run() // Run GEMM if(_asm_glue->is_configured()) { - _asm_glue->run(); + _asm_glue->run(_asm_glue_tensors); } else { @@ -583,7 +590,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); } - _asm_glue->prepare(); + _asm_glue->prepare(_asm_glue_tensors); if(!original_b_managed_by_weights_manager) { _original_b->mark_as_unused(); diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp new file mode 100644 index 0000000000..b47a08a5e9 --- /dev/null +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/runtime/cpu/operators/CpuActivation.h" +#include "src/runtime/cpu/operators/CpuPermute.h" +#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +#include <set> + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) +{ + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = src->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->data_type(); + // Merge activation with output stage + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + }; + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + if(supported_acts.count(act.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); + } + GEMMLowpOutputStageInfo os_info; + os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + os_info.gemmlowp_offset = uoqinfo.offset; + os_info.gemmlowp_min_bound = min_activation; + os_info.gemmlowp_max_bound = max_activation; + os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); + return os_info; +} +cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = is_indirect ? 
cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv; + asm_info.ps_info = info.conv_info; + asm_info.activation_info = info.act_info; + asm_info.depth_output_gemm3d = true; + asm_info.reinterpret_input_as_3d = true; + asm_info.padding_top = info.conv_info.pad_top(); + asm_info.padding_left = info.conv_info.pad_left(); + asm_info.padding_value = 0.f; + asm_info.negated_offsets = false; + return asm_info; +} +} // namespace + +CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) + : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>(memory_manager)), + _activation_func(std::make_unique<CpuActivation>()), + _weights_permute_func(std::make_unique<CpuPermute>()), + _permuted_weights_info(), + _permuted_weights(std::make_unique<Tensor>()) +{ +} + +CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; + +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, + weights, + biases != nullptr ? biases : nullptr, + dst, + info)); + _original_weights_info = weights; + _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 }); + + // Configure assembly dispatch + cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); + if(is_data_type_quantized(src->data_type())) + { + asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); + } + _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info); + + // Configure activation + if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info)) + { + _activation_func->configure(dst, nullptr, info.act_info); + _run_activation = true; + } +} +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); + const DataType data_type = src->data_type(); + const TensorShape i_shape = src->tensor_shape(); + const TensorShape w_shape = weights->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + // Validate biases + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(data_type)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if(data_type == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != 
weights->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info)); + return Status{}; +} +void CpuGemmDirectConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + _gemm_asm_func->run(tensors); + if(_run_activation) + { + _activation_func->run(tensors); + } +} + +void CpuGemmDirectConv2d::allocate_permuted_weights() +{ + // TODO: This function will be removed when memory injection is implemeted. + ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr); + _permuted_weights->allocator()->free(); + _permuted_weights->allocator()->init(_permuted_weights_info); + _permuted_weights->allocator()->allocate(); +} + +void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + allocate_permuted_weights(); + ITensorPack permute_tensors + { + { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, + { TensorType::ACL_DST, _permuted_weights.get() }, + }; + + _weights_permute_func->run(permute_tensors); + + tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused(); + + // switch the original tensor with permuted tensor + tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get()); + _is_prepared = true; + } +} + +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h new file mode 100644 index 0000000000..6aa17c2349 --- /dev/null +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H +#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" +#include "src/runtime/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; +struct Conv2dInfo; +namespace cpu +{ +class CpuGemmAssemblyDispatch; +class CpuActivation; +class CpuPermute; + +class CpuGemmDirectConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); + /** Destructor */ + ~CpuGemmDirectConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. 
+ * Data types supported: Same as @p input. + * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d + * + * Similar to CpuGemmDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + +private: + std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<CpuPermute> _weights_permute_func; + const ITensorInfo *_original_weights_info{}; + TensorInfo _permuted_weights_info; + std::unique_ptr<Tensor> _permuted_weights{ nullptr }; + bool _is_prepared{ false }; + bool _run_activation{ false }; + + /** Function to allocated a tensor for permuted weights + * + * @note This function will be removed when memory injection is properly implemented. + */ + void allocate_permuted_weights(); +}; +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */ diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 36c1bbb1b3..0c511ff548 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -56,14 +56,13 @@ struct Params bool indirect; }; -Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info) +Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - Params p; - p.M = d->info()->tensor_shape().y(); - p.K = a->info()->tensor_shape().x(); - p.N = d->info()->tensor_shape().x(); + p.M = d->tensor_shape().y(); + p.K = a->tensor_shape().x(); + p.N = d->tensor_shape().x(); p.batches = 1; p.multis = 1; p.sections = 1; @@ -72,19 +71,19 @@ Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { p.indirect = true; - p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3]; + p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; } else { - p.multis = b->info()->tensor_shape().z(); - p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis; + p.multis = b->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(2) / p.multis; } // Update M in case of GEMM3D for output if(info.depth_output_gemm3d != 0) { - p.M = d->info()->tensor_shape().y() * d->info()->tensor_shape().z(); - p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis; + p.M = d->tensor_shape().y() * d->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(3) / p.multis; } return p; @@ -205,11 +204,11 @@ public: } private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; + Tensor _output{}; + int _ldb{}; + const TypeInput *_in1_ptr{}; + int _multi_stride_b{}; + size_t _B_pretranspose_size{}; 
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; }; @@ -221,8 +220,7 @@ public: /** Destructor */ ~Fallback() { - // Release memory if we have allocated the memory ourselves - if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b))) + if(_pretranspose && !(is_weight_managed())) { delete _pretranspose; } @@ -240,7 +238,7 @@ public: * @param[in] weights_manager Weights manager to be used by the function. * @param[in] os Output stage meta-data. */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, + void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); @@ -262,8 +260,8 @@ public: const std::vector<int32_t> &multipliers); // Inherited methods overridden: - void run() override; - void prepare() override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; bool is_configured() const override; private: @@ -283,28 +281,12 @@ private: */ void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); /** Prepare the indirect buffer */ - void prepare_indirect_buffer(); + void prepare_indirect_buffer(ITensorPack &tensors); /** Assembly Gemm kernel */ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; /** Optimised Arm® Neon™ kernel */ std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; - /** Input A */ - const ITensor *_a - { - nullptr - }; - /** Input B */ - const ITensor *_b - { - nullptr - }; - const ITensor *_c - { - nullptr - }; - /** Output */ - ITensor *_d{ nullptr }; /** GEMM workspace */ Tensor _workspace{}; /** Pre-transpose tensor */ @@ -328,8 +310,27 @@ private: /** Indirect buffer */ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; - std::vector<TypeInput> _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; + std::vector<TypeInput> _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + + bool is_weight_managed() + { + // TODO (COMPMID-4539): This function should do the following: + // _weights_manager && _weights_manager->are_weights_managed(_b) + // , where _b is the second Tensor that is used to be given to the configure(). + // Currently, however, weight manager is disabled to make this class stateless. + // This should be revisited in the future. + return false; + } + + void acquire_managed_weight() + { + // TODO (COMPMID-4539): This function should do the following: + // _pretranspose = _weights_manager->acquire(_b, &_weights_transform); + // , where _b is the second Tensor that is used to be given to the configure(). + // Currently, however, weight manager is disabled to make this class stateless. 
+ _pretranspose = nullptr; + } }; template <typename TypeInput, typename TypeOutput, class OutputStage> @@ -352,14 +353,15 @@ Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vec } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer() +void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors) { - const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(_a->buffer()); + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer()); const int multis = 1; - const int batches = _a->info()->tensor_shape().total_size_upper(3); - const size_t stride_A = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); - const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput); - const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput); + const int batches = a->info()->tensor_shape().total_size_upper(3); + const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput); + const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput); const size_t output_hw = _cp.output_height * _cp.output_width; const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); @@ -466,10 +468,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, +void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) { + ARM_COMPUTE_UNUSED(c); arm_gemm::GemmConfig gemm_cfg; _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os); _weights_manager = weights_manager; @@ -508,10 +511,6 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c } _optimised_kernel = std::move(acl_gemm_wrapper); - _a = a; - _b = b; - _c = c; - _d = d; _gemm_info = gemm_info; // Check for pre-transposed support if(_gemm_kernel_asm->B_pretranspose_required()) @@ -519,10 +518,10 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); - if(weights_manager && _weights_manager->are_weights_managed(b)) + if(is_weight_managed()) { _weights_transform.configure(B_pretranspose_size, alignment); - _pretranspose = _weights_manager->acquire(b, &_weights_transform); + acquire_managed_weight(); } else { @@ -534,32 +533,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c // Handle indirect GEMM convolution if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) { - configure_indirect(a->info(), b->info(), d->info(), gemm_info); + configure_indirect(a, b, d, gemm_info); } } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, 
OutputStage>::prepare() +void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); if(!_is_prepared) { // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. - if(_c && _c->info()->data_type() == DataType::S32) + if(c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required if(_gemm_kernel_asm->B_pretranspose_required()) { - const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput); - const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput); + const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput); + const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); - if(_weights_manager && _weights_manager->are_weights_managed(_b)) + if(is_weight_managed()) { _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm); - _weights_manager->run(_b, &_weights_transform); + _weights_manager->run(b, &_weights_transform); // If we didn't run the reshape function, set the pretransposed buffer if(!_weights_transform.is_reshape_run()) @@ -572,13 +573,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare() static_cast<Tensor *>(_pretranspose)->allocator()->allocate(); ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr); _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b); - _b->mark_as_unused(); + b->mark_as_unused(); } } if(_gemm_info.method == AsmConvMethod::Indirect) { - prepare_indirect_buffer(); + prepare_indirect_buffer(tensors); } _is_prepared = true; @@ -601,37 +602,42 @@ bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::run() +void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) { - int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto d = tensors.get_tensor(TensorType::ACL_DST); + + int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput); int ldb = 0; - const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput); + const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput); const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2; const size_t a_multi_idx = a_batch_idx + 1; const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 
3 : 2; const size_t d_multi_idx = d_batch_idx + 1; - int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput); - const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput); + int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput); + const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput); - int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput); + int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput); int multi_stride_b = 0; - const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput); + const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput); - auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; - auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes()); + auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes()); // Check if B is pre-tranposed and de-reference if not if(!_gemm_kernel_asm->B_is_pretransposed()) { - ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput); - multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput); - in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes()); + ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput); + multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); + in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); } - const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type()); + const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type()); // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads if(_workspace.buffer() != nullptr) @@ -654,13 +660,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() } // Prepare assembly kernel - prepare(); + prepare(tensors); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
TypeOutput *bias = nullptr; - if(_c && _c->info()->data_type() != DataType::S32) + if(c && c->info()->data_type() != DataType::S32) { - bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()); + bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); } if(_gemm_info.method == AsmConvMethod::Indirect) @@ -682,7 +688,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() template <typename TypeInput, typename TypeOutput> void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { Params p = extract_parameters(a, b, d, info); @@ -699,7 +705,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge template <typename TypeInput, typename TypeOutput> void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { ARM_COMPUTE_UNUSED(activation); @@ -714,8 +720,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & // Configure requantization info const int32_t negation = info.negated_offsets ? 1 : -1; - const int32_t a_offset = -a->info()->quantization_info().uniform().offset * negation; - const int32_t b_offset = -b->info()->quantization_info().uniform().offset * negation; + const int32_t a_offset = -a->quantization_info().uniform().offset * negation; + const int32_t b_offset = -b->quantization_info().uniform().offset * negation; const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; @@ -786,18 +792,18 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo return act.type != arm_gemm::Activation::Type::None; } -void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info) +void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!CpuGemmAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, d->info(), info)) + if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) { return; } - switch(a->info()->data_type()) + switch(a->data_type()) { case DataType::F32: create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); @@ -805,7 +811,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->info()->data_type() == DataType::S32) + if(d->data_type() == DataType::S32) { create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } @@ -816,7 +822,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->info()->data_type() == DataType::S32) + if(d->data_type() == DataType::S32) { create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } @@ -841,10 +847,10 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons } } -void CpuGemmAssemblyDispatch::prepare() +void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->prepare(); + _arm_gemm->prepare(tensors); } bool CpuGemmAssemblyDispatch::is_configured() const @@ -852,12 +858,12 @@ bool CpuGemmAssemblyDispatch::is_configured() const return _arm_gemm != nullptr && _arm_gemm->is_configured(); } -void CpuGemmAssemblyDispatch::run() +void CpuGemmAssemblyDispatch::run(ITensorPack &tensors) { MemoryGroupResourceScope scope_mg(_memory_group); ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->run(); + _arm_gemm->run(tensors); } } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h index 0bbae49a7e..ffc097c75c 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -21,14 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_NEGEMMASSEMBLYDISPATCH_H -#define SRC_NEGEMMASSEMBLYDISPATCH_H +#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H +#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H -#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/common/Macros.h" +#include "src/runtime/cpu/ICpuOperator.h" namespace arm_compute { @@ -57,29 +58,23 @@ struct AsmGemmInfo }; /** Assembly kernel glue */ -class CpuGemmAssemblyDispatch : public IFunction +class CpuGemmAssemblyDispatch : public ICpuOperator { public: /** Constructor */ CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); - /** Prevent instances of this class from being copy constructed */ - CpuGemmAssemblyDispatch(const CpuGemmAssemblyDispatch &) = delete; - /** Prevent instances of this class from being copied */ - CpuGemmAssemblyDispatch &operator=(const CpuGemmAssemblyDispatch &) = delete; - /** Default move constructor */ - CpuGemmAssemblyDispatch(CpuGemmAssemblyDispatch &&) = default; - /** Default move assignment operator */ - CpuGemmAssemblyDispatch &operator=(CpuGemmAssemblyDispatch &&) = default; /** Defautl destructor */ ~CpuGemmAssemblyDispatch() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch); + class IFallback { public: - virtual void run() = 0; - virtual void prepare() = 0; - virtual bool is_configured() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual bool is_configured() const = 0; + virtual ~IFallback() = default; }; public: @@ -91,7 +86,7 @@ public: * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. * @param[in] info GEMM meta-data */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info); + void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -118,8 +113,8 @@ public: bool is_configured() const; // Inherited methods overridden: - void prepare() override; - void run() override; + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; private: std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */ @@ -128,4 +123,4 @@ private: }; } // namespace cpu } // namespace arm_compute -#endif /* SRC_NEGEMMASSEMBLYDISPATCH_H */ +#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */ |
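The NEGEMMConv2d hunks above also show the wrapper idiom the public runtime functions adopt around these stateless operators: a small pImpl struct owns the operator together with an ITensorPack, configure() forwards only ITensorInfo descriptors, and run()/prepare() hand the pack to the operator. Below is a condensed sketch of that idiom; MyOperator and MyFunction are placeholder names (the real pair in this patch is cpu::CpuGemmDirectConv2d and NEGEMMConv2d), and the include paths are indicative of this tree rather than authoritative.

```cpp
// Sketch of the pImpl + ITensorPack wrapper idiom used by NEGEMMConv2d in this
// patch. MyOperator / MyFunction are placeholders; only ITensorPack, TensorType
// and the add_*_tensor() calls are taken from the patch itself.
#include <memory>

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/experimental/Types.h" // TensorType / ITensorPack (path as assumed for this tree)

namespace example
{
using namespace arm_compute;

// Placeholder stateless operator: configure() sees only metadata, while
// prepare()/run() receive the actual buffers through the pack.
class MyOperator
{
public:
    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst)
    {
        // select kernels / validate based purely on the ITensorInfo descriptors
    }
    void prepare(ITensorPack &tensors)
    {
        // one-off work, e.g. permuting the weights found under ACL_SRC_1
    }
    void run(ITensorPack &tensors)
    {
        // read ACL_SRC_0..2 and write ACL_DST
    }
};

// Placeholder runtime function: keeps the tensors in a pack and forwards them.
class MyFunction
{
public:
    void configure(ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst)
    {
        _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, src);
        _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights);
        _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases);
        _impl->tensors.add_tensor(TensorType::ACL_DST, dst);
        _impl->op->configure(src->info(), weights->info(),
                             biases != nullptr ? biases->info() : nullptr, dst->info());
    }
    void prepare() { _impl->op->prepare(_impl->tensors); }
    void run()     { _impl->op->run(_impl->tensors); }

private:
    struct Impl
    {
        ITensorPack                 tensors{};
        std::unique_ptr<MyOperator> op{ std::make_unique<MyOperator>() };
    };
    std::unique_ptr<Impl> _impl{ std::make_unique<Impl>() };
};
} // namespace example
```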