| author    | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
| committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
| commit    | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
| tree      | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/cpu/operators | |
| parent    | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
| download  | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz | |
Apply clang-format on repository
Code is formatted according to a revised clang-format configuration
file (not part of this delivery); clang-format version 14.0.6 is used
(a sketch of such a configuration is given after the exclusion list below).
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
and the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
A follow-up change will cover formatting of the .cl files and the
files under tests/ and compute_kernel_writer/validation/.
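Since the revised configuration file is not part of this delivery, the following is only a rough sketch of a `.clang-format` file that could produce the style visible in this diff. Every option and value below is inferred from the formatted output, not taken from the actual Arm configuration:

```
# Hypothetical .clang-format sketch -- inferred from the diff, not the delivered file.
Language:                     Cpp
ColumnLimit:                  120               # wrapped lines in the diff break near column 120
IndentWidth:                  4
BreakBeforeBraces:            Allman            # braces consistently placed on their own line
PointerAlignment:             Right             # e.g. 'const ITensorInfo *input'
SpaceBeforeParens:            ControlStatements # 'if (...)', 'for (...)', 'switch (...)'
BinPackParameters:            false             # one parameter per line once a signature wraps
BinPackArguments:             false
AlignConsecutiveAssignments:  true              # e.g. the aligned '_aux_mem[...] =' pairs
AlignConsecutiveDeclarations: true              # aligned types and names in parameter lists
SortIncludes:                 CaseInsensitive   # include blocks are re-sorted and merged
Cpp11BracedListStyle:         true              # '{0}' rather than '{ 0 }'
```

Reproducing the include ordering seen below (arm_compute/ headers grouped ahead of src/ ones) would additionally require IncludeCategories regex rules; those, too, would be guesses rather than the delivered configuration.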
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/cpu/operators')
68 files changed, 2300 insertions, 1436 deletions
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp index 197e9850b9..44d70cf503 100644 --- a/src/cpu/operators/CpuActivation.cpp +++ b/src/cpu/operators/CpuActivation.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuActivation.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" @@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con _kernel = std::move(k); } -Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { return kernels::CpuActivationKernel::validate(input, output, activation_info); } @@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -69,7 +75,7 @@ std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTenso act_op->configure(&src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h index e21fc7d32c..ec442f92c8 100644 --- a/src/cpu/operators/CpuActivation.h +++ b/src/cpu/operators/CpuActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp index 41def8e22f..53cd7fa1b7 100644 --- a/src/cpu/operators/CpuAdd.cpp +++ b/src/cpu/operators/CpuAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuAdd.h" -#include "src/cpu/kernels/CpuAddKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuAddKernel.h" namespace arm_compute { namespace cpu { -void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); 
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); @@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuAddKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h index db05c100cc..5f60102de2 100644 --- a/src/cpu/operators/CpuAdd.h +++ b/src/cpu/operators/CpuAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -55,14 +56,22 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp index 590ee482ca..2f19f2f842 100644 --- a/src/cpu/operators/CpuAddMulAdd.cpp +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -21,39 +21,49 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "src/cpu/operators/CpuAddMulAdd.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuAddMulAddKernel.h" -#include "src/cpu/operators/CpuAddMulAdd.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" namespace arm_compute { namespace cpu { -void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); auto k = std::make_unique<kernels::CpuAddMulAddKernel>(); const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); - k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info); + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); // Save auxilary memory requirements after configuration - _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size()); - _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size()); + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); } else { @@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input _kernel = std::move(k); } -Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); @@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); - return 
kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); } else { - return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); } } @@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors) { const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); - CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true); - CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true); + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); - ITensorPack dequantize_mul_pack = - { - { TensorType::ACL_SRC_0, bn_mul }, - { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() } - }; + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; - ITensorPack dequantize_add_pack = - { - { TensorType::ACL_SRC_0, bn_add }, - { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() } - }; + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}}; _dequantize_bn_mul.run(dequantize_mul_pack); _dequantize_bn_add.run(dequantize_add_pack); - ITensorPack add_mul_add_pack = - { - { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) }, - { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() }, - { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() }, - { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) }, - { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) }, + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, }; NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h index cf1ece68f1..47db75c37e 100644 --- a/src/cpu/operators/CpuAddMulAdd.h +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -42,20 +42,28 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - 
ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAddMulAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -77,7 +85,7 @@ private: TensorInfo _dequantized_bn_mul{}; TensorInfo _dequantized_bn_add{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp index 1cfd8c1d0e..55b9204d71 100644 --- a/src/cpu/operators/CpuCast.cpp +++ b/src/cpu/operators/CpuCast.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCast.h" -#include "src/cpu/kernels/CpuCastKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp index 4021fd8ded..5f517a8fcb 100644 --- a/src/cpu/operators/CpuConcatenate.cpp +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -23,21 +23,20 @@ */ #include "src/cpu/operators/CpuConcatenate.h" -#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" -#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" -#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" -#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" namespace arm_compute { @@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect unsigned int offset = 0; - for(unsigned int i = 0; i < _num_srcs; ++i) + for (unsigned int i = 0; i < _num_srcs; ++i) { - switch(axis) + switch (axis) { case Window::DimX: { @@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect } } -Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis) +Status +CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const 
ITensorInfo *dst, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); unsigned int offset = 0; - for(const auto &src : srcs_vector) + for (const auto &src : srcs_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - switch(axis) + switch (axis) { case Window::DimX: { @@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec offset += src->dimension(axis); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec void CpuConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) + if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h index eb11926b48..c36977c70f 100644 --- a/src/cpu/operators/CpuConcatenate.h +++ b/src/cpu/operators/CpuConcatenate.h @@ -68,8 +68,8 @@ public: private: std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{}; - unsigned int _num_srcs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 16ac16b3ba..19311733db 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuConv2d.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv2d.h" #include "src/cpu/operators/CpuGemm.h" @@ -35,26 +37,35 @@ namespace arm_compute { namespace cpu { -CpuConv2d::CpuConv2d() - : _function() +CpuConv2d::CpuConv2d() : _function() { } CpuConv2d::~CpuConv2d() = default; -void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { @@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso _aux_mem = _function->workspace(); } -Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, 
biases, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM: - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM_CONV2D: ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); @@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } -ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); ARM_COMPUTE_UNUSED(weights_info); @@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - const std::vector<ConfigurationMethod> known_configs = - { + const std::vector<ConfigurationMethod> known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM), + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; const auto find_config = [&](ConfigurationMethod c) { const ConvolutionConfiguration config = c.first; const PadStrideInfo info = std::get<3>(config); - return std::get<0>(config) == 
Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } @@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co { // SRGAN // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) && + (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if(input->dimension(idx_c) < 16) + if (input->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // This heuristics only applies to F16 data type on A55r1 - if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16) + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) - const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = - { + const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = { // Squeezenet_V1_1 fire2 and fire3 - ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire6 and fire7 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire8 and fire9 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), }; const auto find_conv_config = [&](ConvolutionConfiguration c) { const PadStrideInfo info = std::get<3>(c); - return std::get<0>(c) == 
Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; - bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(), - find_conv_config) - != known_bad_winograd_f16_with_fastmath_configs.end(); - if(found_bad) + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) { return ConvolutionMethod::GEMM; } @@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // For 1x1 convolutions run the default GEMM - if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) { return ConvolutionMethod::GEMM; } - if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::WINOGRAD; } - if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) + if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) { return ConvolutionMethod::GEMM_CONV2D; } diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h index 0908ac0cbb..71b9e15dc1 100644 --- a/src/cpu/operators/CpuConv2d.h +++ b/src/cpu/operators/CpuConv2d.h @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -102,17 +103,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d * * Similar to CpuConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CpuConv2d * * @param[in] src Source tensor info. 
3 lower dimensions represent a single input [width, height, IFM], @@ -132,11 +148,17 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp index 810ffb1e4e..49e31926e3 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" @@ -31,7 +32,10 @@ namespace arm_compute { namespace cpu { -void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>(); @@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI _kernel = std::move(k); } -Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } @@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) { NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h index ea70eee134..e208cca3a0 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h @@ -41,14 +41,18 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run(ITensorPack &tensors) override; }; diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp index 7420ff6240..92c19d4df2 100644 --- a/src/cpu/operators/CpuCopy.cpp +++ b/src/cpu/operators/CpuCopy.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCopy.h" -#include "src/cpu/kernels/CpuCopyKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 884fe5c4ed..54075f2afa 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -24,10 +24,11 @@ #include "src/cpu/operators/CpuDepthwiseConv2d.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -37,11 +38,16 @@ namespace cpu { namespace { -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } @@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - 
info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } @@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _has_bias = biases != nullptr; @@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI _are_weights_const = weights->are_values_constant(); // Configure pipeline - _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>(); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique<cpu::CpuPermute>(); _permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI } // Configure activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<cpu::CpuActivation>(); _activationlayer_function->configure(dst, nullptr, info.act_info); @@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute input - if(_permute) + if (_permute) { ITensorPack pack; auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run assembly function - if(_is_nchw) + if (_is_nchw) { auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Permute output - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac { // if weights are not constant then we need to repack so that weights // can be updated in-place - if(!_are_weights_const) + if (!_are_weights_const) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac return; } - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute weights - if(_permute) + if (_permute) { auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); @@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac } } -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info)); _is_nchw = src->data_layout() == DataLayout::NCHW; _is_prepared = !_is_nchw; @@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, auto input_perm = std::make_unique<TensorInfo>(); auto weights_perm = std::make_unique<TensorInfo>(); - auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + auto output_perm = std::make_unique<TensorInfo>( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique<cpu::CpuPermute>(); _permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - if(_is_nchw) + if (_is_nchw) { _permute_output = std::make_unique<cpu::CpuPermute>(); _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); @@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, //Configure Activation Layer _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<cpu::CpuActivation>(); _activationlayer_function->configure(dst, nullptr, info.act_info); } } -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { TensorShape permuted_input_shape = src->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + 
.reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); } // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - if(_is_nchw) + if (_is_nchw) { prepare(tensors); auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); @@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } else { @@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) _permute_output->run(pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) void 
CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -441,12 +476,17 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors } } -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); - _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); - switch(_depth_conv_func) + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.configure(src, weights, biases, dst, info); @@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, } } -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) + switch (depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); @@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w } } -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) { return DepthwiseConvolutionFunction::OPTIMIZED; } @@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi void CpuDepthwiseConv2d::run(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.run(tensors); @@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors) void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.prepare(tensors); diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h index 3d8719ee44..7eaa0df857 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.h +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H -#include 
"arm_compute/core/ITensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -56,14 +57,22 @@ public: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d * * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -76,7 +85,10 @@ public: * * @return a Depthwise Convolution Function */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info); // Inherited methods overriden: @@ -118,32 +130,40 @@ private: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - bool _are_weights_const{ true }; + std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _has_bias{false}; + bool _is_quantized{false}; + bool _is_nchw{true}; + bool _permute{false}; + bool _is_activationlayer_enabled{false}; + bool _is_prepared{false}; + bool _are_weights_const{true}; }; /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: @@ -176,7 +196,11 @@ private: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data.
*/ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -184,24 +208,28 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; + std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; }; - DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC }; + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; CpuDepthwiseConv2dGeneric _func_generic{}; }; diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index d078155155..8d3741de96 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -38,15 +39,14 @@ namespace cpu { struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl { - std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr }; - bool is_prepared{ false }; - bool are_weights_const{ true }; + std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; experimental::MemoryRequirements mem_req{}; }; #ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() - : _pImpl(std::make_unique<LocalImpl>()) +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>()) { } #endif /* DOXYGEN_SKIP_THIS */ @@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, _pImpl->are_weights_const = weights->are_values_constant(); // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was 
successful via is_configured() - if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) { return; } @@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, // Compute memory requirements for assembly kernels constexpr size_t alignment = 4096; - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment }); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment }); + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); _pImpl->asm_kernel = std::move(dwc_wrapper); } -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); } @@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) { const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) { // Pack weights and bias const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) const auto weights_padding = weights->info()->padding(); const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; - const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); weights->mark_as_unused(); - if(bias != nullptr) + if (bias != nullptr) { bias->mark_as_unused(); } diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h index f222ab9cf9..f1816625d2 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -53,14 +54,22 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Checks if activation is supported by the assembly kernels * * @param[in] activation Activation to check @@ -70,8 +79,8 @@ public: static bool is_activation_supported(const ActivationLayerInfo &activation); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp index 12dc136ba3..c05a23f3a7 100644 --- a/src/cpu/operators/CpuDequantize.cpp +++ b/src/cpu/operators/CpuDequantize.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDequantizeKernel.h" diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp index 9cdbdb61c1..135a3bb2b9 100644 --- a/src/cpu/operators/CpuDirectConv2d.cpp +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,12 +37,25 @@ namespace cpu CpuDirectConv2d::~CpuDirectConv2d() = default; CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() { } -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); @@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _input_border_handler = std::make_unique<NEFillBorderKernel>(); // Free accumulator - 
if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _has_bias = (bias != nullptr); _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) + if (_has_bias) { _output_stage_kernel->configure(dst, bias); } _is_padding_required = !_conv_kernel->border_size().empty(); - if(_is_padding_required) + if (_is_padding_required) { // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f))); + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast<float>(0.f))); } //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<CpuActivation>(); _activationlayer_function->configure(dst, dst, act_info); } } -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); @@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), @@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate bias kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); } @@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors) auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_is_padding_required) + if (_is_padding_required) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); } NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) + if (_has_bias) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, dst); @@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h index fa8d61e083..73c85f2dcd 100644 --- a/src/cpu/operators/CpuDirectConv2d.h +++ b/src/cpu/operators/CpuDirectConv2d.h @@ 
-24,13 +24,14 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H #define ARM_COMPUTE_CPU_DIRECTCONV2D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -75,14 +76,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -95,10 +105,10 @@ private: std::unique_ptr<NEFillBorderKernel> _input_border_handler; std::unique_ptr<CpuActivation> _activationlayer_function; Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp index aa74e420a6..626f1c6775 100644 --- a/src/cpu/operators/CpuDirectConv3d.cpp +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,11 +37,17 @@ namespace cpu CpuDirectConv3d::~CpuDirectConv3d() = default; CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) { } -void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +void CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) { ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); 
ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); @@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen //Configure Activation Layer _is_activationlayer_enabled = conv_info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<CpuActivation>(); _activationlayer_function->configure(dst, dst, conv_info.act_info); } } -Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info) +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); - if(conv_info.act_info.enabled()) + if (conv_info.act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); } @@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors) } } } // namespace cpu -} // namespace arm_compute
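Illustrative usage (not part of this patch): CpuDirectConv3d follows the library's stateless validate/configure/run pattern, so a caller checks the ITensorInfo descriptors first, configures only on success, and binds real tensors in an ITensorPack at run time. A minimal sketch, assuming placeholder NDHWC shapes, a 1x1x1 kernel and a default-constructed Conv3dInfo:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/cpu/operators/CpuDirectConv3d.h"

using namespace arm_compute;

void example_direct_conv3d()
{
    // Hypothetical example; shapes and descriptor settings are illustrative only.
    TensorInfo src(TensorShape(16U, 8U, 8U, 8U, 1U), 1, DataType::F32, DataLayout::NDHWC);
    TensorInfo wei(TensorShape(16U, 1U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC);
    TensorInfo dst(TensorShape(4U, 8U, 8U, 8U, 1U), 1, DataType::F32, DataLayout::NDHWC);
    const Conv3dInfo conv_info{}; // default stride/padding, no fused activation

    cpu::CpuDirectConv3d conv{};
    if (bool(cpu::CpuDirectConv3d::validate(&src, &wei, nullptr, &dst, conv_info)))
    {
        // src0 = input, src1 = weights, src2 = optional bias (nullptr here)
        conv.configure(&src, &wei, nullptr, &dst, conv_info);
        // At run time, bind ITensors as ACL_SRC_0/ACL_SRC_1/ACL_SRC_2 and ACL_DST
        // in an ITensorPack and call conv.run(pack).
    }
}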
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h index cde01f07c2..3ad1e09a14 100644 --- a/src/cpu/operators/CpuDirectConv3d.h +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -24,14 +24,15 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H #define ARM_COMPUTE_CPU_DIRECTCONV3D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -76,14 +77,19 @@ public: * The 1st dimension must be equal to the 1st dimension of the @p kernels tensor. * @param[in] conv_info Contains padding, stride, activation information. */ - void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -93,8 +99,8 @@ private: std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel; std::unique_ptr<CpuActivation> _activationlayer_function; Tensor _accumulator; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp index b88ae3e514..c2ae8773c6 100644 --- a/src/cpu/operators/CpuElementwise.cpp +++ b/src/cpu/operators/CpuElementwise.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwise.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseKernel.h" @@ -33,7 +34,7 @@ namespace cpu void CpuElementwiseBase::run(ITensorPack &tensors) { // If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, con } template <ComparisonOperation COP> -Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); } -void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op) +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) { ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); auto k = std::make_unique<kernels::CpuComparisonKernel>(); @@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI _kernel = std::move(k); } -Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op) +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) { return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); } @@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual> template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h index b6c61cf245..5db53c8026 100644 --- a/src/cpu/operators/CpuElementwise.h +++ b/src/cpu/operators/CpuElementwise.h @@ -139,7 +139,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); }; /** Basic function to run @ref cpu::kernels::CpuComparisonKernel @@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqua } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
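Illustrative usage (not part of this patch): a minimal sketch of the runtime-selected comparison operator declared above, assuming placeholder shapes; comparisons produce a U8 mask:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuElementwise.h"

using namespace arm_compute;

void example_comparison()
{
    // Hypothetical example; shapes are illustrative only.
    TensorInfo a(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo mask(TensorShape(32U, 4U), 1, DataType::U8);

    if (bool(cpu::CpuElementwiseComparison::validate(&a, &b, &mask, ComparisonOperation::Greater)))
    {
        cpu::CpuElementwiseComparison cmp{};
        cmp.configure(&a, &b, &mask, ComparisonOperation::Greater);
        // run() then takes an ITensorPack binding ACL_SRC_0, ACL_SRC_1 and ACL_DST.
    }
}

The CpuElementwiseComparisonStatic aliases (NEGreater, NELessEqual, ...) fix the operation at compile time and drop the ComparisonOperation argument.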
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp index 7fd14dba7d..04ab7bf8f5 100644 --- a/src/cpu/operators/CpuElementwiseUnary.cpp +++ b/src/cpu/operators/CpuElementwiseUnary.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwiseUnary.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" @@ -47,7 +48,7 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src void CpuElementwiseUnary::run(ITensorPack &tensors) { - if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors) ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h index 5e8e98d047..1e51bfaa1c 100644 --- a/src/cpu/operators/CpuElementwiseUnary.h +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -56,4 +57,4 @@ public: } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
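Illustrative usage (not part of this patch): CpuElementwiseUnary takes the operation as a leading enum and, unlike the binary operators above, takes tensor info references rather than pointers. A minimal sketch, assuming a placeholder shape and the RSQRT operation:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuElementwiseUnary.h"

using namespace arm_compute;

void example_unary()
{
    // Hypothetical example; shape is illustrative only.
    TensorInfo src(TensorShape(64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U), 1, DataType::F32);

    if (bool(cpu::CpuElementwiseUnary::validate(ElementWiseUnary::RSQRT, src, dst)))
    {
        cpu::CpuElementwiseUnary op{};
        op.configure(ElementWiseUnary::RSQRT, src, dst);
    }
}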
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp index 3d8f62fe07..1890d0b916 100644 --- a/src/cpu/operators/CpuFill.cpp +++ b/src/cpu/operators/CpuFill.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFill.h" -#include "src/cpu/kernels/CpuFillKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h index 41d9a9fa8a..cb83745d29 100644 --- a/src/cpu/operators/CpuFill.h +++ b/src/cpu/operators/CpuFill.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_H #include "arm_compute/core/PixelValue.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp index 7bab9e481c..2609d44590 100644 --- a/src/cpu/operators/CpuFlatten.cpp +++ b/src/cpu/operators/CpuFlatten.cpp @@ -23,16 +23,14 @@ */ #include "src/cpu/operators/CpuFlatten.h" -#include "src/cpu/operators/CpuReshape.h" - #include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuReshape.h" namespace arm_compute { namespace cpu { -CpuFlatten::CpuFlatten() - : _reshape(nullptr) +CpuFlatten::CpuFlatten() : _reshape(nullptr) { } diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp index 868add7d29..a107393b01 100644 --- a/src/cpu/operators/CpuFloor.cpp +++ b/src/cpu/operators/CpuFloor.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFloor.h" -#include "src/cpu/kernels/CpuFloorKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFloorKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp index 395d8d2aa5..85a0b0311b 100644 --- a/src/cpu/operators/CpuFullyConnected.cpp +++ b/src/cpu/operators/CpuFullyConnected.cpp @@ -25,10 +25,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" @@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = 
quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format) +Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); GEMMLowpOutputStageInfo gemmlowp_output_stage_info; ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); @@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe // Validate gemmlowp function TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info, - &weights_info, - biases, - dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); } else { @@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected() CpuFullyConnected::~CpuFullyConnected() = default; -void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); TensorInfo src_info = 
src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); // Configure gemmlowp function and output stage for asymmetric quantized types GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); GEMMInfo gemm_info; @@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * } } -void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); @@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI configure_mm(&_flattened_src, weights, biases, dst, act); } -void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); @@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf configure_mm(src, weights, biases, dst, act); } -void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - fc_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? 
biases : nullptr, dst, fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); _needs_weights_conversion = false; @@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Reshape weights if needed - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Reshape the weights _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>(); @@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>(); - _convert_weights->configure(weights_to_use, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); @@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei _trans_weights_idx = AuxTensorIdx::ConvertedWeights; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); @@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Retain the tensorinfo with the weights to use - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { _trans_weights = *weights_to_use; } // Set auxiliary memory requirements auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. _aux_mem[TransposedWeights] = MemoryInfo( offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent : - MemoryLifetime::Prepare, + _dynamic_weights ? MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? 
MemoryLifetime::Persistent + : MemoryLifetime::Prepare, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - _needs_weights_conversion ? MemoryLifetime::Prepare : - MemoryLifetime::Persistent, - _reshaped_weights.total_size()); + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, _converted_weights.total_size()); } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info) +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) { GEMMInfo gemm_info; gemm_info.set_activation_info(fc_info.activation_info); @@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); if (is_fixed_format_fast_math(weights_info.weight_format())) { @@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != 
ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } } - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { is_fc_after_conv = src->num_dimensions() > 1; } - if(!weights_reshaped) + if (!weights_reshaped) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); @@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); return Status{}; } @@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors) CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); // Linearize src if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); } // Run matrix multiply - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { _mm_gemmlowp->run(gemm_pack); } @@ -486,7 +528,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) { - if(!_is_prepared || _dynamic_weights) + if (!_is_prepared || _dynamic_weights) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape of the weights (happens only once) - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; - NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); cur_weights->mark_as_unused(); cur_weights = reshaped_weights.get(); } // Convert weights if needed (happens only once) - if(_needs_weights_conversion) + if (_needs_weights_conversion) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM and release unused weights - if(!_is_quantized_asymmetric) + if (!_is_quantized_asymmetric) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h index 1e8c6478d0..7073fb9f7c 100644 --- a/src/cpu/operators/CpuFullyConnected.h +++ b/src/cpu/operators/CpuFullyConnected.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H #define ARM_COMPUTE_CPU_FULLY_CONNECTED_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/FullyConnectedLayerInfo.h" +#include "src/cpu/ICpuOperator.h" + #include <memory> namespace arm_compute @@ -86,16 +86,24 @@ public: * @param[in] fc_info (Optional) Fully connected layer additional info * @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected * * Similar to @ref CpuFullyConnected::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo
fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same @@ -103,19 +111,35 @@ public: * * @return a status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, WeightsInfo weights_info); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info); //Inherited methods override - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); + void configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); enum AuxTensorIdx { diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 34b845928d..8da166dbef 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -24,9 +24,10 @@ #include "src/cpu/operators/CpuGemm.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } } // namespace -void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +void CpuGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, 
gemm_info)); ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info); - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = beta == 1 && c != nullptr; + bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. // Check if we need to reshape the matrix B only on the first run _is_prepared = false; @@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _run_alpha_scale = alpha != 1.f; _run_bias_addition = is_c_bias; _run_addition = beta != 0 && beta != 1 && c != nullptr; - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - if(run_optimised) + if (run_optimised) { const ITensorInfo *c_to_use = is_c_bias ? 
c : nullptr; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); @@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; // Scale product by alpha - if(_run_alpha_scale) + if (_run_alpha_scale) { _alpha_scale_func = std::make_unique<cpu::CpuActivation>(); - _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); } } else @@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) + if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); @@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso // Configure interleave kernel _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); _interleave_kernel->configure(a, &_tmp_a); - _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); // Configure transpose kernel _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + _aux_mem[TransposedRHS] = + MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); // Configure matrix multiplication kernel _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); } - if(_run_bias_addition) + if (_run_bias_addition) { _add_bias = std::make_unique<cpu::CpuAdd>(); _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); } } // Configure matrix addition kernel - if(_run_addition) + if (_run_addition) { _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>(); _ma_kernel->configure(c, d, beta); } // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func = std::make_unique<cpu::CpuActivation>(); _activation_func->configure(d, nullptr, gemm_info.activation_info()); } } -Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; @@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - 
if(is_fixed_format_fast_math(gemm_info.weight_format())) + if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if(a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz % block_by) != 0, + ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right // b->dimension(1) = kernel_area * input_channel // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz - kernel_area * input_pad_right) != b->dimension(1), + "The product AB is defined only if A number of columns and B number of rows are related"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + a->dimension(0) != b->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) + if (a->data_type() != DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d); } - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), + "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), + "The C matrix must have the same number of columns as the matrix B"); } - if(d->total_size() != 0) + if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
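        // (Illustrative note, not from this patch: fixed-format kernels use blocked B layouts,
        // e.g. the OHWIo<block> WeightFormat values, so b->dimension(0) may be padded to a block
        // multiple; only the non-fixed-format case is checked against d->dimension(0) below.)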
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) + if (gemm_info.depth_output_gemm3d() != 0) { - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); @@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens } // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. - - if(!run_optimised) + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); // Check if the first input tensor is a vector. 
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; @@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; const ITensorInfo *matrix_b_info = b; @@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens TensorInfo tmp_b_info{}; TensorInfo tmp_output_info = *d->clone(); - if(run_interleave_transpose) + if (run_interleave_transpose) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape( + *b, mult_transpose1xW_width))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); } // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - if(is_c_bias) + if (is_c_bias) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); } } // Validate matrix addition kernel - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); } @@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors) auto c = tensors.get_const_tensor(ACL_SRC_2); auto d = tensors.get_tensor(ACL_DST); - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { // Pass c to asm dispatch only if it's the bias tensor ITensorPack asm_pack = tensors; asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? 
c : nullptr); _asm_glue->run(asm_pack); - if(_run_alpha_scale) + if (_run_alpha_scale) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _alpha_scale_func->run(pack); } } @@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors) CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); - ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } }; - if(!_run_vector_matrix_multiplication) + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + if (!_run_vector_matrix_multiplication) { // Run interleave kernel - ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } }; - NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } // Use reshaped matrices @@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors) mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); } - NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack); + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); // Run bias addition kernel - if(_run_bias_addition) + if (_run_bias_addition) { - ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; _add_bias->run(pack); } } // Run matrix addition kernel - if(_run_addition) + if (_run_addition) { - ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } }; + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); } // Run activation function - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _activation_func->run(pack); } } void CpuGemm::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { _asm_glue->prepare(tensors); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + ITensor *b_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } _is_prepared = true; } @@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const return _aux_mem; } -Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info) +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) { const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h index 9b08e5d0f6..6b30d134fa 100644 --- a/src/cpu/operators/CpuGemm.h +++ b/src/cpu/operators/CpuGemm.h @@ -24,12 +24,12 @@ #ifndef ARM_COMPUTE_CPU_GEMM_H #define ARM_COMPUTE_CPU_GEMM_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/GEMMInfo.h" + +#include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" @@ -93,16 +93,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, 
const ITensorInfo *c, ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm. * * Similar to @ref CpuGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -111,12 +121,16 @@ public: * the value of arm_compute::WeightFormat need to be passed via the * parameter gemm_info. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info = GEMMInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; /** Indicates if the convolution executes in variable weights mode. 
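A minimal usage sketch of the interface above, assuming an F32 build; the tensor names and shapes are illustrative and the aux-tensor/workspace plumbing that real callers must honour via workspace() is omitted:

#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuGemm.h"

using namespace arm_compute;

void gemm_sketch()
{
    // D = alpha * A * B with M = 32, K = 64, N = 16 (TensorShape is (cols, rows)).
    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    cpu::CpuGemm gemm;
    gemm.configure(a.info(), b.info(), nullptr, d.info(), /* alpha */ 1.0f, /* beta */ 0.0f);

    // Operators hold no tensor data; tensors are bound at run time through a pack.
    ITensorPack pack{{ACL_SRC_0, &a}, {ACL_SRC_1, &b}, {ACL_DST, &d}};
    gemm.prepare(pack); // one-off work, e.g. reshaping B on the first run
    gemm.run(pack);
}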
@@ -138,28 +152,28 @@ private: Count }; - std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr }; - std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr }; - std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr }; - std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr }; - std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr }; - std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr }; - std::unique_ptr<CpuAdd> _add_bias{ nullptr }; - std::unique_ptr<CpuActivation> _activation_func{ nullptr }; + std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr}; + std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr}; + std::unique_ptr<CpuAdd> _add_bias{nullptr}; + std::unique_ptr<CpuActivation> _activation_func{nullptr}; TensorInfo _tmp_a{}; TensorInfo _tmp_b{}; TensorInfo _tmp_d{}; - bool _run_vector_matrix_multiplication{ false }; - bool _run_alpha_scale{ false }; - bool _run_addition{ false }; - bool _run_bias_addition{ false }; - bool _run_activation{ false }; - bool _reshape_b_only_on_first_run{ false }; - bool _is_prepared{ false }; + bool _run_vector_matrix_multiplication{false}; + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index 39b410d609..7c59d88c61 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" @@ -52,8 +52,11 @@ namespace arm_compute { namespace cpu { -CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - const bool skip_im2col = (data_layout == 
DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - if(skip_im2col) + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) { - return { true, true }; + return {true, true}; } } else { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) { - return { false, true }; + return {false, true}; } } // Default case when we cannot reinterpret the input and output as 3D. - return { false, false }; + return {false, false}; } CpuGemmConv2d::CpuGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } CpuGemmConv2d::~CpuGemmConv2d() = default; -void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, - bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format) +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format)); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape 
weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); // Supported activations in GEMM - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(_is_quantized) + if (_is_quantized) { - TensorInfo tmp_src{ *src }; - TensorInfo tmp_weights{ *weights }; + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo iqinfo = src->quantization_info(); @@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig const DataType data_type = src->data_type(); tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) { const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); @@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act_info.activation()) != 0) + if (supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format, - weight_format)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format)); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig _mm_gemm = std::make_unique<CpuGemm>(); _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ActivationLayerInfo 
&act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format) +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) { const DataType data_type = src->data_type(); const bool is_quantized = is_data_type_quantized_asymmetric(data_type); const bool is_activation_enabled = act_info.enabled(); // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math, - false, act_info)); + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info)); } else { @@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei } } -Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const 
ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) { const DataType data_type = input_info->data_type(); const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U; // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col); + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); } -void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, - weights, - biases, - dst, - conv_info, - weights_info, - dilation, - act_info, - enable_fast_math, - num_groups)); - ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); const DataType data_type = src->data_type(); const DataLayout data_layout = src->data_layout(); @@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; @@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - 
kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); - _skip_im2col = skip_info.skip_im2col; - _skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; // Get parameters from conv_info unsigned int stride_x = 0; @@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _weights_reshaped.set_quantization_info(weights->quantization_info()); // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); unsigned int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); } // Configure _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>(); - _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format()); + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); - if(!_skip_col2im && _data_layout == DataLayout::NCHW) + if (!_skip_col2im && _data_layout == DataLayout::NCHW) { // Configure col2im _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>(); @@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights gemm_trans_wei = _mm_gemmlowp != nullptr ? 
_aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS // Check lifetime - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); const bool skip_im2col = skip_info.skip_im2col; const bool skip_col2im = skip_info.skip_col2im; const unsigned int gemm_3d_depth = skip_col2im ? 
conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); - if(!is_fixed_format(weights_info.weight_format())) + if (!is_fixed_format(weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); - const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); 
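    // Weights must be at most 4D; grouped convolution (num_groups != 1) is not supported by this operator.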
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(is_bf16) + else if (is_bf16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight } unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type()); weights_reshaped_info.set_quantization_info(weights->quantization_info()); weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); - mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); } // Create tensor info for im2col reshaped inputs @@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); im2col_reshaped_info.set_quantization_info(src->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); gemm_input_to_use = &im2col_reshaped_info; } // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); shape_gemm.set(0, mat_weights_cols); @@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight gemm_output_to_use = &info_gemm; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col, fixed_format, weights_info.weight_format())); // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) + if (!skip_col2im && (data_layout == DataLayout::NCHW)) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); } return Status{}; @@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); - if(!_skip_im2col) + if (!_skip_im2col) { // Run input reshaping unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack); gemm_input_to_use = im2col_output.get(); } @@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) gemm3d.allocator()->import_memory(out_to_use->buffer()); auto gemm_output_to_use = gemm_output.get(); - if(_skip_im2col) + if (_skip_im2col) { gemm_output_to_use = &gemm3d; } - if(_skip_col2im && !out_has_padding) + if (_skip_col2im && !out_has_padding) { gemm_output_to_use = dst; } @@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors) // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); - if(!this->isVarWeightsKernel()) + if (!this->isVarWeightsKernel()) { pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output.get() }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); } else { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } - else if(out_has_padding) + else if (out_has_padding) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } void CpuGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Variable weights executions that use fixed-format kernels // need no reshaping of the weights. - if(this->isVarWeightsKernel()) + if (this->isVarWeightsKernel()) { _is_quantized ? 
_mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); _is_prepared = true; @@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors) // Run weights reshaping and mark original weights tensor as unused CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); weights->mark_as_unused(); ITensorPack gemm_pack = tensors; diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h index 61fe63a79f..118d366517 100644 --- a/src/cpu/operators/CpuGemmConv2d.h +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" #include <memory> @@ -106,17 +107,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -124,10 +140,16 @@ public: * * @return a status. 
*/ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const bool enable_fast_math = false); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const bool enable_fast_math = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -150,8 +172,15 @@ private: * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. */ - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -170,8 +199,16 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool skip_im2col = false, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -182,7 +219,11 @@ private: * * @return a status */ - static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status validate_gemm3d(const ITensorInfo *src, + const ITensorInfo *weights, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col); struct SkipInfo { @@ -200,8 +241,11 @@ private: * * @return a SkipInfo instance. */ - static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info); + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); /** Indicates if the convolution executes in variable weights mode. * @@ -236,7 +280,7 @@ private: bool _is_quantized; bool _is_prepared; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp index 5ce285cb6f..8fa81b1907 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" - #include "support/Cast.h" #include <set> @@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast; namespace { -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); const DataType data_type = src->data_type(); // Merge activation with output stage - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act.activation()) != 0) + if (supported_acts.count(act.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); } @@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d() 
CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; -void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); _is_prepared = false; - _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2}); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); } @@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); } else { // We must permute weights if they are WeightFormat::UNSPECIFIED - if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); + if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } -Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(!is_fixed_format(info.weights_info.weight_format())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8,
DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + if (!is_fixed_format(info.weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(data_type == DataType::BFLOAT16) + else if (data_type == DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) prepare(tensors); _gemm_asm_func->run(tensors); - if(_run_activation) + if (_run_activation) { ITensor *io = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } }; + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; _activation_func->run(pack); } } void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // If we are using fixed-format kernel the weights are already reshaped - if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) { _gemm_asm_func->prepare(tensors); _is_prepared = true; return; } - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _weights_permute_func->run(permute_tensors); tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h index e55a461f36..1cc3caadae 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuActivation.h" @@ -69,18 +70,26 @@ public: * Data types supported: Same as @p input. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d * * Similar to CpuGemmDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 8ca128fb07..2ee879b67b 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" @@ -59,12 +59,12 @@ namespace cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) { cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - asm_info.fast_mode = info.fast_math(); + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); return asm_info; } @@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() } CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; -void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); @@ -122,28 +123,31 @@ 
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _reshape_b_only_on_first_run = b->are_values_constant(); _is_prepared = false; _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _gemm_info = gemm_info; + _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && + _reshape_b_only_on_first_run; + _gemm_info = gemm_info; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); const ITensorInfo *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if (_flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; _a_offset = _signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + _signed_output = dst->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { _fuse_output_stage = true; _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); @@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso // Initialize assembly kernel meta-data const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - switch(a->data_type()) + switch (a->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: case DataType::U8: case DataType::S8: { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? 
nullptr : c; _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info); @@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if (!(_assembly_path || _run_vector_matrix_multiplication)) { matrix_a = &_tmp_a; matrix_b = &_tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); @@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_b_reshape_kernel->configure(b, &_tmp_b); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); @@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); @@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if (_fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : dst, - a->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel = + std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? 
&_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if (_flip_signedness) { _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); _convert_from_signed_asymm->configure(&_signed_output, dst); @@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso else { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); _mm_kernel->configure(matrix_a, matrix_b, dst); } // Configure offset contribution kernel _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>(); - _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), + _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), _a_offset, _b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _run_activation = + activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if (_run_activation) { _activation_func = std::make_unique<CpuActivation>(); _activation_func->configure(dst, nullptr, activation); } - if(_assembly_path) + if (_assembly_path) { auto asm_mem_req = _asm_glue->workspace(); _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; @@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Request memory for LHS and RHS reshape matrix - _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0 - && _reshape_b_only_on_first_run ? - MemoryLifetime::Persistent : - MemoryLifetime::Temporary, - _vector_sum_col.total_size()); - _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); - _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); + _aux_mem[VectorSumCol] = + MemoryInfo(offset_int_vec(VectorSumCol), + !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent + : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + _aux_mem[VectorSumRow] = + MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _tmp_b.total_size()); + _aux_mem[MMResultS32] = + MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); + _aux_mem[SignedOutput] = + MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); } -Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); @@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens int32_t b_offset = b->quantization_info().uniform().offset; bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + if (fuse_output_stage) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, + a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } // Convert QASYMM8->QASYMM8_SIGNED TensorInfo signed_a{}; TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if (flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const 
UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); a_to_use = &signed_a; a_offset = signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + signed_output = output->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens bool run_optimised = false; bool run_optimised_requantized = false; - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); + run_optimised = bool(CpuGemmAssemblyDispatch::validate( + a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); } } - if(run_optimised) + if (run_optimised) { ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) + if (info.depth_output_gemm3d() != 0) { - if(info.reinterpret_input_as_3d()) + if (info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); @@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) + if (!run_vector_matrix_multiplication) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; @@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } } - if(!run_optimised_requantized) + if (!run_optimised_requantized) { TensorInfo info_vector_sum_col{}; TensorInfo info_vector_sum_row{}; @@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); } - if(fuse_output_stage) + if (fuse_output_stage) { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset, + b_offset, info.gemmlowp_output_stage())); } else { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + a_offset, b_offset)); } } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); } @@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if (_flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a }, - { TensorType::ACL_DST, signed_a.get() } - }; - NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); a_to_use = signed_a.get(); matrix_a = signed_a.get(); } // Run GEMM - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { ITensorPack asm_glue_tensors = tensors; auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); @@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - if(!_run_vector_matrix_multiplication) + if (!_run_vector_matrix_multiplication) { matrix_a = tmp_a.get(); matrix_b = tmp_b.get(); // Run interleave kernel - ITensorPack pack_a = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, tmp_a.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a); + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { - ITensorPack pack_b = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; // Run transpose kernel - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b); + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); } } - ITensorPack pack_mm = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b } - }; - if(_fuse_output_stage) + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) { pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); } @@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, vector_sum_row.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack pack = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } - if(_fuse_output_stage) + if (_fuse_output_stage) { ITensorPack pack; 
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); @@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); } else { @@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, signed_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if (_run_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; _activation_func->run(pack); } } void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); // Run assembly reshape - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { _asm_glue->prepare(tensors); } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { // Run reshape kernel and mark original weights tensor as unused - ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); + ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) { - ITensor *vector_sum_col_p = 
utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } _is_prepared = true; } diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h index a1b34291d0..a7798938e7 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/GEMMInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -108,18 +109,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp index 58f98acff0..4215eed199 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp +++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" @@ -36,36 +37,42 @@ namespace arm_compute { namespace cpu { -void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - 
switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QASYMM8_SIGNED: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QSYMM16: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); _kernel = std::move(k); break; } @@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } } -Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: - return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, 
info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QASYMM8_SIGNED: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QSYMM16: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); } } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h index 39394f6b5f..e5e2f41fa9 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.h +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" /** This file contains all available output stages for GEMMLowp. @@ -76,7 +77,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 8811a7ea6b..89087129c3 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -23,14 +23,16 @@ */ #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" + #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/MatMulInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +48,11 @@ namespace cpu { namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo } // namespace CpuMatMul::CpuMatMul() - : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + _original_dst_shape() { } -Status 
CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status CpuMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); @@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b - if(adj_lhs) + if (adj_lhs) { - auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); + auto_init_if_empty(lhs_transposed, + lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed)); // Assign lhs_to_use pointer to use transposed TensorInfo lhs_to_use = &lhs_transposed; } - if(adj_rhs) + if (adj_rhs) { - auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); + auto_init_if_empty(rhs_transposed, + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed)); // Assign rhs_to_use pointer to use transposed TensorInfo rhs_to_use = &rhs_transposed; } ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)"); + "The product AB is defined only if the number of columns in A is equal to the " + "number of rows in B (after transpose)"); // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors - for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) + for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), + "Broadcasting in Batch dimension is unsupported by this operator."); } // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, + gemm_info.activation_info, gemm_info.output_stage)); } cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); @@ -138,7 +162,12 @@ Status 
CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CpuMatMul::configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, _original_rhs_shape = rhs_to_use.tensor_shape(); // Reshape lhs for use with assembly kernels. - lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); - dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // 2. Configuration for transpose of lhs/rhs // ------------------------------------------------------ // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) - if(_adj_lhs) + if (_adj_lhs) { // Setup transpose LHS _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); } - if(_adj_rhs) + if (_adj_rhs) { // Setup transpose RHS _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); @@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, rhs_to_use = (_adj_rhs) ? 
_rhs_transposed : rhs_to_use; // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage); + get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, + _gemm_info.output_stage); } // Configure Asm Kernel _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); - _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul + _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, + _gemm_info); // c is nullptr as bias not supported in MatMul // Specify memory requirements for intermediate tensors auto asm_mem_req = _asm_glue->workspace(); // Specify memory required by gemm kernel int idx = 0; - for(const auto &aux : asm_mem_req) + for (const auto &aux : asm_mem_req) { _aux_mem[idx] = aux; idx++; @@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors) // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm) // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly) - lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z - dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + lhs->info()->set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, + _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + dst->info()->set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, + _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // Initialise object to handle stored transposed tensors in auxillary memory @@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors) ITensorPack asm_tensors(tensors); // Run transpose lhs if necessary - if(_adj_lhs) + if (_adj_lhs) { - ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack); + ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), + lhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get()); } // Run transpose rhs if necessary - if(_adj_rhs) + if (_adj_rhs) { - ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack); + ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), + rhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get()); } // Run asm kernel diff --git 
a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index 475c019fd0..24db3da346 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_OPERATORS_CPUMATMUL #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuTransposeKernel.h" @@ -66,18 +67,27 @@ public: * @param[in] settings The settings for matmul operation (i.e fast math) * @param[in] act_info Class containing information about fused activation function. */ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -91,9 +101,9 @@ private: }; // Define unique pointers to kernels/operators used by matmul - std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr }; - std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr }; - std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr }; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr}; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; // TensorInfo for tensors stored in auxillary memory TensorInfo _lhs_transposed{}; @@ -105,13 +115,13 @@ private: TensorShape _original_dst_shape{}; // Note : adj_lhs means the same as transposing lhs - bool _adj_lhs{ false }; - bool _adj_rhs{ false }; - bool _fast_math{ false }; + bool _adj_lhs{false}; + bool _adj_rhs{false}; + bool _fast_math{false}; AsmGemmInfo _gemm_info{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; -} -} +} // namespace cpu +} // namespace arm_compute #endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */ diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp index 24e9fd6d46..697fc40ab3 100644 --- a/src/cpu/operators/CpuMaxUnpooling.cpp +++ b/src/cpu/operators/CpuMaxUnpooling.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
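
For orientation, a hypothetical driver for the operator declared above. It follows the usual experimental-operator flow (static validate, configure on TensorInfo, run on an ITensorPack) and assumes the lhs/rhs/dst pack slots are ACL_SRC_0/ACL_SRC_1/ACL_DST, matching the transpose packs shown earlier; names and shapes are illustrative only.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h" // CpuMatMulSettings
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuMatMul.h" // internal header, available in in-tree builds only

using namespace arm_compute;

void run_matmul_example()
{
    // ACL shapes are (x, y, ...) = (columns, rows, ...): lhs is M x K, rhs is K x N, dst is M x N.
    TensorInfo lhs_info(TensorShape(32U, 16U), 1, DataType::F32); // K = 32, M = 16
    TensorInfo rhs_info(TensorShape(8U, 32U), 1, DataType::F32);  // N = 8,  K = 32
    TensorInfo dst_info(TensorShape(8U, 16U), 1, DataType::F32);

    // validate() above rejects constant inputs ("must be dynamic").
    lhs_info.set_are_values_constant(false);
    rhs_info.set_are_values_constant(false);

    MatMulInfo        mm_info;  // adj_lhs/adj_rhs default to false
    CpuMatMulSettings settings; // fast_math defaults to false

    if (!bool(cpu::CpuMatMul::validate(&lhs_info, &rhs_info, &dst_info, mm_info, settings)))
    {
        return; // configuration not supported on this target
    }

    cpu::CpuMatMul op;
    op.configure(&lhs_info, &rhs_info, &dst_info, mm_info, settings);

    Tensor lhs, rhs, dst;
    lhs.allocator()->init(lhs_info);
    rhs.allocator()->init(rhs_info);
    dst.allocator()->init(dst_info);
    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill lhs/rhs; real code must also allocate and pack the aux tensors reported by op.workspace() ...

    ITensorPack pack = {{TensorType::ACL_SRC_0, &lhs}, {TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_DST, &dst}};
    op.run(pack);
}
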
*/ #include "src/cpu/operators/CpuMaxUnpooling.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" @@ -29,7 +30,10 @@ namespace arm_compute { namespace cpu { -void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpooling::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info); auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>(); @@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic _kernel = std::move(k); } -Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpooling::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h index aa1f1072a5..5dc00bce9e 100644 --- a/src/cpu/operators/CpuMaxUnpooling.h +++ b/src/cpu/operators/CpuMaxUnpooling.h @@ -44,14 +44,18 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMaxUnpooling::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp index 4c15015206..ac9847111d 100644 --- a/src/cpu/operators/CpuMul.cpp +++ b/src/cpu/operators/CpuMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMulKernel.h" @@ -33,14 +34,24 @@ namespace arm_compute { namespace cpu { -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); } -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, 
+ ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuComplexMulKernel::validate(src1, src2, dst); } -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); @@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute
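
A quick hypothetical probe of the validate() entry point above; TO_ZERO is the conventional rounding choice for float data, and act_info is left at its default because, as the check above shows, a fused activation is rejected.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuMul.h" // internal header, in-tree builds only

using namespace arm_compute;

bool mul_config_is_supported()
{
    TensorInfo a(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo b(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo c(TensorShape(8U, 8U), 1, DataType::F32);
    return bool(cpu::CpuMul::validate(&a, &b, &c, /*scale=*/1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
}
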
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h index 3e0edbf050..82b309830b 100644 --- a/src/cpu/operators/CpuMul.h +++ b/src/cpu/operators/CpuMul.h @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -61,7 +62,12 @@ public: * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +75,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -89,14 +100,20 @@ public: * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
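
For the complex variant documented here, each element is a (real, imaginary) pair carried in two channels and the kernel computes the ordinary complex product. A plain scalar model of that arithmetic, for illustration:

// (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
struct Complexf
{
    float re;
    float im;
};

inline Complexf complex_mul(Complexf x, Complexf y)
{
    return {x.re * y.re - x.im * y.im, x.re * y.im + x.im * y.re};
}
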
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp index babaf21b6f..25acc92d00 100644 --- a/src/cpu/operators/CpuPermute.cpp +++ b/src/cpu/operators/CpuPermute.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuPermute.h" -#include "src/cpu/kernels/CpuPermuteKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons { return kernels::CpuPermuteKernel::validate(src, dst, perm); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp index 722cd36ee5..b72bde6978 100644 --- a/src/cpu/operators/CpuPool2d.cpp +++ b/src/cpu/operators/CpuPool2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool2dKernel.h" #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" @@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices); // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); // Get data layout _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; @@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Check if we have Global Pooling Layer const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - _use_kernel_indices = pool_info.use_kernel_indices; + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; - if(run_optimised) + if (run_optimised) { const CPUInfo &ci = NEScheduler::get().cpu_info(); const unsigned int num_threads = NEScheduler::get().num_threads(); @@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Get kernel's memory requirements constexpr size_t alignment = 4096; const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); _asm_glue = std::move(pooling_wrapper); } @@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer } } -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - if(run_optimised) + if (run_optimised) { return Status{}; } @@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - if(_asm_glue) + if (_asm_glue) { const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); } else { - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); break; case DataLayout::NHWC: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? 
Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); break; default: ARM_COMPUTE_ERROR("Data layout not supported"); diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h index 5c571db88a..ea73e3f335 100644 --- a/src/cpu/operators/CpuPool2d.h +++ b/src/cpu/operators/CpuPool2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -58,17 +59,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp index 14e4ac6c97..7fa78c1f80 100644 --- a/src/cpu/operators/CpuPool3d.cpp +++ b/src/cpu/operators/CpuPool3d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool3dKernel.h" @@ -35,8 +36,7 @@ namespace arm_compute { namespace cpu { -CpuPool3d::CpuPool3d() - : _aux_mem(1) +CpuPool3d::CpuPool3d() : _aux_mem(1) { } @@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const } } // namespace cpu -} // namespace arm_compute
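
A small hypothetical probe of the pooling validate() shown above. Passing a non-null indices info would disable the assembly path that validate() tries first; the shapes assume NHWC with a 2x2, stride-2 max pool.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuPool2d.h" // internal header, in-tree builds only

using namespace arm_compute;

bool pool_config_is_supported()
{
    TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32); // NHWC: C = 16, W = 32, H = 32
    TensorInfo dst(TensorShape(16U, 16U, 16U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PoolingLayerInfo pool_info(PoolingType::MAX, /*pool_size=*/2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));
    return bool(cpu::CpuPool2d::validate(&src, &dst, pool_info, /*indices=*/nullptr));
}
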
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h index 8a73f8a0af..235d798095 100644 --- a/src/cpu/operators/CpuPool3d.h +++ b/src/cpu/operators/CpuPool3d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -61,7 +62,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp index f9e14d1f88..4315499c39 100644 --- a/src/cpu/operators/CpuQuantize.cpp +++ b/src/cpu/operators/CpuQuantize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuQuantizeKernel.h" diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp index e6892a2e7e..a423abb49a 100644 --- a/src/cpu/operators/CpuReshape.cpp +++ b/src/cpu/operators/CpuReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/cpu/operators/CpuReshape.h" -#include "src/cpu/kernels/CpuReshapeKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuReshapeKernel.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) void CpuReshape::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - if(!_is_prepared) + if (!_is_prepared) { static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors); _is_prepared = true; diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h index 9bc43e7db4..33da792319 100644 --- a/src/cpu/operators/CpuReshape.h +++ b/src/cpu/operators/CpuReshape.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_RESHAPE_H #define ARM_COMPUTE_CPU_RESHAPE_H -#include "src/cpu/ICpuOperator.h" #include "arm_compute/core/Window.h" +#include "src/cpu/ICpuOperator.h" + namespace arm_compute { namespace cpu @@ -53,7 +54,7 @@ public: void run(ITensorPack &tensors) override; private: - bool _is_prepared{ false } ; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp index 8a712bf088..7df9296931 100644 --- a/src/cpu/operators/CpuScale.cpp +++ b/src/cpu/operators/CpuScale.cpp @@ -24,8 +24,9 @@ #include "src/cpu/operators/CpuScale.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/kernels/CpuScaleKernel.h" @@ -37,11 +38,12 @@ namespace cpu { namespace { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) { ARM_COMPUTE_ERROR_ON(offsets == 
nullptr); float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) + if (sampling_policy == SamplingPolicy::CENTER) { sampling_offset = 0.5f; } @@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { // Pre-compute the offset and pixel's distance for BILINEAR interpolation Iterator offsets_it(offsets, win); Iterator dx_it(dx, win); Iterator dy_it(dy, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); } else { // Pre-compute the offset for NEAREST interpolation Iterator offsets_it(offsets, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - }, - offsets_it); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast<size_t>( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + }, + offsets_it); } } } // namespace @@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn _is_prepared = false; // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; // Get the tensor shape TensorShape shape(dst->dimension(idx_width)); @@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ITensorInfo *offsets = nullptr; ITensorInfo *dx = nullptr; @@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && 
arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape of auxilary buffers const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); TensorInfo tensor_info_offsets(shape, Format::S32); TensorInfo tensor_info_dx(shape, Format::F32); TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: offsets = &tensor_info_offsets; @@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const break; } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); return Status{}; } void CpuScale::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { _is_prepared = true; const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); @@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors) const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors) } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h index ee7c523bad..c12a8e733a 100644 --- a/src/cpu/operators/CpuScale.h +++ b/src/cpu/operators/CpuScale.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_SCALE_H #define ARM_COMPUTE_CPU_SCALE_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -62,9 +63,9 @@ public: void run(ITensorPack &tensors) override; private: - ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - bool _is_prepared{ false }; + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index bf4c2fa3a2..e55d7f903e 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" @@ -63,13 +64,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); _needs_permute = actual_axis > 0; - if(_needs_permute) + if (_needs_permute) { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_input.configure(src, &_input_permuted, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } // We want to deal with a 2D input. 
Either it is the permuted version of the original input (4D case) @@ -79,10 +82,11 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d // Create intermediate tensors shapes TensorShape max_sum_shape = tmp_input->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); + TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); + TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); // Init intermediate tensors _max = TensorInfo(max_info); @@ -94,13 +98,14 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d _max_kernel = std::move(mk); auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); - if(_needs_permute) + if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_output.configure(&_output_permuted, dst, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } else { @@ -109,11 +114,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _input_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _output_permuted.total_size()); } template <bool IS_LOG> @@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || 
static_cast<int32_t>(src->num_dimensions()) <= axis); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || + static_cast<int32_t>(src->num_dimensions()) <= axis); // Create intermediate tensor info DataType tmp_data_type = src->data_type(); @@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor TensorShape max_sum_shape = src->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); + const TensorInfo tensor_info_max_sum(src->clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(src->quantization_info()) + .set_is_resizable(true)); const TensorInfo dont_care; - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); const bool needs_permute = actual_axis > 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); + TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate( + &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); return Status{}; } @@ -166,43 +184,38 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); + CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, + true); ITensorPack max_pack; ITensorPack softmax_pack; - if(_needs_permute) + if (_needs_permute) { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; + ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; + max_pack = {{TensorType::ACL_SRC, 
input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = - { - { TensorType::ACL_SRC_0, input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; + softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, output_permuted.get()}, + {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; + max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, dst}, + {TensorType::ACL_DST_1, tmp.get()}}; } NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - if(_needs_permute) + if (_needs_permute) { ITensorPack permute_out_pack; permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); @@ -211,7 +224,7 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) } } -template <bool IS_LOG> +template <bool IS_LOG> experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const { return _aux_mem; diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 64df8704f9..8cab70e14f 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -24,11 +24,13 @@ #ifndef ARM_COMPUTE_CPU_SOFTMAX_H #define ARM_COMPUTE_CPU_SOFTMAX_H -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/TensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuPermute.h" + #include <memory> namespace arm_compute @@ -77,7 +79,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp index 91a5b6e63c..7d27efbc96 100644 --- a/src/cpu/operators/CpuSub.cpp +++ b/src/cpu/operators/CpuSub.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuSub.h" -#include "src/cpu/kernels/CpuSubKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuSubKernel.h" namespace arm_compute { namespace cpu { -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); @@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, 
ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuSubKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h index 88908637aa..d1782a1d3c 100644 --- a/src/cpu/operators/CpuSub.h +++ b/src/cpu/operators/CpuSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -53,14 +54,22 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSub::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp index 4e7854fd6e..ea548e0511 100644 --- a/src/cpu/operators/CpuTranspose.cpp +++ b/src/cpu/operators/CpuTranspose.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuTranspose.h" -#include "src/cpu/kernels/CpuTransposeKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) { return kernels::CpuTransposeKernel::validate(src, dst); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index c4edd89964..9d07736c13 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -22,23 +22,25 @@ * SOFTWARE. 
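
The policy note in the CpuSub documentation above is easy to trip over: for quantized data types ConvertPolicy::WRAP is rejected, so SATURATE must be requested. A minimal hypothetical probe, following the same validate pattern as the other operators in this patch:

#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuSub.h" // internal header, in-tree builds only

using namespace arm_compute;

bool sub_config_is_supported()
{
    const QuantizationInfo qinfo(0.5f, 10);
    TensorInfo a(TensorShape(8U, 8U), 1, DataType::QASYMM8);
    TensorInfo b(TensorShape(8U, 8U), 1, DataType::QASYMM8);
    TensorInfo c(TensorShape(8U, 8U), 1, DataType::QASYMM8);
    a.set_quantization_info(qinfo);
    b.set_quantization_info(qinfo);
    c.set_quantization_info(qinfo);
    // ConvertPolicy::WRAP would be rejected here for quantized inputs.
    return bool(cpu::CpuSub::validate(&a, &b, &c, ConvertPolicy::SATURATE));
}
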
*/ #include "src/cpu/operators/CpuWinogradConv2d.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuPermute.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" @@ -56,21 +58,26 @@ namespace inline Tensor4DShape internal_get_shape(const ITensorInfo *in) { const DataLayout data_layout = in->data_layout(); - const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(dst, weights); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); @@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo 
&conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math, - arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) { arm_conv::winograd::WinogradConfig winograd_cfg; arm_gemm::GemmConfig cfg; const DataType data_type = src->data_type(); - Tensor4DShape in_shape{ internal_get_shape(src) }; - Tensor4DShape out_shape{ internal_get_shape(dst) }; - Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; uint32_t nthreads = NEScheduler::get().num_threads(); // Get configuration arguments for Winograd winograd_cfg.output_rows = 0; winograd_cfg.output_cols = 0; conv_args = std::make_unique<arm_conv::ConvolutionArgs>( - in_shape.n_batches, - arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) }, - in_shape.n_channels, - conv_info.pad_top(), - conv_info.pad_left(), - arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) }, - out_shape.n_channels, - arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) }, - assembly_utils::map_to_arm_gemm_activation(act_info)); + in_shape.n_batches, + arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)}, + in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(), + arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)}, + out_shape.n_channels, + arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)}, + assembly_utils::map_to_arm_gemm_activation(act_info)); bool success = false; - if(data_type == DataType::F32) + if (data_type == DataType::F32) { - success = arm_conv::winograd::get_implementation<float>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - else if(data_type == DataType::F16) + else if (data_type == DataType::F16) { - success = arm_conv::winograd::get_implementation<__fp16>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) else @@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf } inline bool fuse_function_supported(const ActivationLayerInfo &act_info) { - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; + return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || + 
act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; } } // namespace @@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d() _permute_output(std::make_unique<CpuPermute>()), _permute_weights(std::make_unique<CpuPermute>()), _aux_mem(AuxTensorIdx::Count), - _conv_args{ nullptr }, + _conv_args{nullptr}, _winograd_impl{}, _data_layout(), _winograd_transformed_input{}, @@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d() _weights_hwio(), _input_nhwc(), _output_nhwc(), - _is_prepared{ false }, - _run_activation{ false } + _is_prepared{false}, + _run_activation{false} { } CpuWinogradConv2d::~CpuWinogradConv2d() = default; -void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void CpuWinogradConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); @@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei const DataType data_type = src->data_type(); uint32_t nthreads = NEScheduler::get().num_threads(); _data_layout = src->data_layout(); - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; - - bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args); - - ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - - const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); - if(has_impl) + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + + bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &_winograd_impl, _conv_args); + + ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + + const bool has_impl = ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) { // Determine how much working space is 
required, allocate it. - const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); - const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); @@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); @@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); // Reorder the convoluted output to ACL's ordering NCHW - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), - dst->dimension(1), dst->dimension(3)), - 1, dst->data_type()); + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1, + dst->data_type()); _output_nhwc = info; _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); } // Configure input transform kernel - _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); + _transform_input_kernel = + std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); // Configure GEMM function - _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f); + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); // Configure output transform kernel - _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); + _transform_output_kernel = + std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); //Configure Activation Layer _run_activation = act_info.enabled() && !fuse_function_supported(act_info); - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, act_info); } @@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _aux_mem[TempResult] = asm_mem_req[TempResult]; // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. 
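For orientation, the MemoryInfo assignments that follow encode the operator's workspace plan: Temporary buffers live only for the duration of a run, Prepare buffers only for the weight-preparation step, and Persistent buffers (the transformed weights) are retained across runs. Because the input- and output-domain transforms execute at different time-steps, the header below even aliases PermutedOutput onto TransformedInput's slot, presumably since the input-domain buffer is dead by the time the output permute runs. A minimal standalone sketch of that aliasing idea, with invented names and sizes (not ACL's memory-manager API):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Two temporaries whose live ranges do not overlap can share one backing
    // allocation sized to the larger of the two requests.
    struct TempRequest
    {
        const char *name;
        std::size_t size_bytes;
    };

    int main()
    {
        const TempRequest transformed_input{"TransformedInput", 4096}; // live while the input transform runs
        const TempRequest permuted_output{"PermutedOutput", 2048};     // live while the output permute runs
        const std::size_t shared = std::max(transformed_input.size_bytes, permuted_output.size_bytes);
        std::printf("one %zu-byte buffer serves both temporaries\n", shared);
        return 0;
    }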
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment); - _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment); - _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment); - if(_data_layout == DataLayout::NCHW) + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) { _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); } } } -Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); // Disable winograd for fp16 if fast math is false. 
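The check that follows gates data types on the fast-math flag: without fast math only F32 may reach the Winograd path, while the F16 route (compiled in under __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) requires the caller to opt in explicitly. A small sketch of that predicate under those assumptions (illustrative helper, not part of ACL):

    #include <cassert>

    enum class DataType { F32, F16 };

    // F32 is always eligible; F16 Winograd is only attempted with fast math.
    static bool winograd_data_type_allowed(DataType dt, bool enable_fast_math)
    {
        return dt == DataType::F32 || (dt == DataType::F16 && enable_fast_math);
    }

    int main()
    {
        assert(winograd_data_type_allowed(DataType::F32, false));
        assert(!winograd_data_type_allowed(DataType::F16, false));
        assert(winograd_data_type_allowed(DataType::F16, true));
        return 0;
    }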
- if(!enable_fast_math) + if (!enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); } - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; arm_conv::winograd::WinogradImpl winograd_impl{}; std::unique_ptr<arm_conv::ConvolutionArgs> conv_args; - const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str()); + const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &winograd_impl, conv_args); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); return Status{}; } @@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory. CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true); + CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, + tensors, true); CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); const bool is_nchw = _data_layout == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}}; _permute_input->run(pack); } - CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true); + CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, + tensors, true); CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? 
input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } }; + ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src}, + {ACL_DST, winograd_input_transformed.get()}, + {ACL_INT, input_workspace.get()}}; NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack); - CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true); + CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, + tensors, true); // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs ITensorPack gemm_pack = tensors; @@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) _gemm_function->run(gemm_pack); // Output transform - ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } }; + ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()}, + {ACL_DST, is_nchw ? output_nhwc.get() : output}, + {ACL_SRC_1, biases}, + {ACL_INT, output_workspace.get()}}; NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack); - if(is_nchw) + if (is_nchw) { // Reorder the convoluted output to ACL's ordering NCHW - ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}}; _permute_output->run(pack); } - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}}; _activation_func->run(pack); } } void CpuWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _permute_weights->run(permute_tensors); const int element_size_in_bytes = permuted_weights.get()->info()->element_size(); // Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format. 
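The stride computations in the next hunk convert ITensorInfo's byte strides into element strides, since the arm_conv weight transform walks the HWIO weights in elements rather than bytes. A self-contained illustration with hypothetical shape values (3x3 kernel, 16 input channels, 64 output channels, F32):

    #include <cstdio>

    int main()
    {
        const int element_size_in_bytes = 4; // F32
        // Hypothetical HWIO byte strides: O is innermost, so stepping I, W and H
        // jumps 64, 16*64 and 3*16*64 elements respectively.
        const int channel_stride_bytes = 64 * element_size_in_bytes;
        const int col_stride_bytes     = 16 * 64 * element_size_in_bytes;
        const int row_stride_bytes     = 3 * 16 * 64 * element_size_in_bytes;
        std::printf("strides in elements: row=%d col=%d channel=%d\n",
                    row_stride_bytes / element_size_in_bytes,
                    col_stride_bytes / element_size_in_bytes,
                    channel_stride_bytes / element_size_in_bytes);
        return 0;
    }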
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors) const unsigned int width_idx = 2; // W in HWIO const unsigned int channel_idx = 1; // I in HWIO - const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; - const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; - const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; + const int permuted_weight_row_stride = + permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; + const int permuted_weight_col_stride = + permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; + const int permuted_weight_channel_stride = + permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory. - ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); + ITensor *weights_transf = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf); const void *permuted_weights_ptr; void *win_wght_transf_ptr; - permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); - win_wght_transf_ptr = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); + permuted_weights_ptr = reinterpret_cast<const void *>( + permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); + win_wght_transf_ptr = + reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); // Prepare Weights _winograd_impl.weight_transform->execute( - *_conv_args, - permuted_weights_ptr, - permuted_weight_row_stride, - permuted_weight_col_stride, - permuted_weight_channel_stride, - win_wght_transf_ptr, - _winograd_impl.winograd_spec, - 0, 1 // Thread 1 of 1 + *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride, + permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1 ); ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get()); diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h index e0df34e2db..7e1d952462 100644 --- a/src/cpu/operators/CpuWinogradConv2d.h +++ b/src/cpu/operators/CpuWinogradConv2d.h @@ -26,10 +26,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/gemm_common.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuGemm.h" #include "src/cpu/operators/CpuPermute.h" @@ -73,7 +74,11 @@ public: * @param[in] enable_fast_math (Optional) Enable fast 
math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d @@ -82,13 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -108,27 +117,28 @@ private: PermutedOutput = TransformedInput, Count = 10 }; - std::unique_ptr<CpuGemm> _gemm_function; - std::unique_ptr<CpuActivation> _activation_func; - std::unique_ptr<ICPPKernel> _transform_input_kernel; - std::unique_ptr<ICPPKernel> _transform_output_kernel; - std::unique_ptr<CpuPermute> _permute_input; - std::unique_ptr<CpuPermute> _permute_output; - std::unique_ptr<CpuPermute> _permute_weights; - experimental::MemoryRequirements _aux_mem{ Count }; - std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor - arm_conv::winograd::WinogradImpl _winograd_impl; - DataLayout _data_layout; - TensorInfo _winograd_transformed_input; - TensorInfo _winograd_transformed_output; - TensorInfo _winograd_transformed_weights; - TensorInfo _input_workspace; - TensorInfo _output_workspace; - TensorInfo _weights_hwio; - TensorInfo _input_nhwc; - TensorInfo _output_nhwc; - bool _is_prepared; - bool _run_activation; + std::unique_ptr<CpuGemm> _gemm_function; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<ICPPKernel> _transform_input_kernel; + std::unique_ptr<ICPPKernel> _transform_output_kernel; + std::unique_ptr<CpuPermute> _permute_input; + std::unique_ptr<CpuPermute> _permute_output; + std::unique_ptr<CpuPermute> _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr<arm_conv::ConvolutionArgs> + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo _output_nhwc; + bool _is_prepared; + bool _run_activation; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp 
index 3069d6b541..343ef21c0b 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -24,12 +24,13 @@ #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" #include <arm_neon.h> @@ -53,7 +54,12 @@ namespace * @param[in] num_threads Number of threads to run this method. Must be >= 1 */ template <typename TypeInput, typename TypeOutput> -void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads) +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads) { ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); ARM_COMPUTE_ERROR_ON(num_threads == 0); @@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutpu const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size(); std::vector<IScheduler::Workload> workloads(num_threads); - for(unsigned int t = 0; t < num_threads; ++t) + for (unsigned int t = 0; t < num_threads; ++t) { - workloads[t] = [ = ](const ThreadInfo & info) + workloads[t] = [=](const ThreadInfo &info) { const unsigned int start = (info.thread_id * wsize) / num_threads; const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; - if(start < end) + if (start < end) { gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end); } @@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen p.sections = 1; p.indirect = false; - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { p.indirect = true; p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; @@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen } // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) + if (info.depth_output_gemm3d != 0) { p.M = d->tensor_shape().y() * d->tensor_shape().z(); p.batches = d->tensor_shape().total_size_upper(3) / p.multis; @@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp // Schedule assembly kernel const int granule_threshold = 200; IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) { scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + else if (method == 
arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) { //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) { //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } return scheduling_hint; @@ -175,8 +186,12 @@ public: * @param[in] gemm_info GEMM meta-data * @param[in] os Output stage meta-data. */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -193,19 +208,20 @@ public: * * @return A tuple with the pointers to the shift and multiplier data respectively */ - std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers); + std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> + set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; bool is_configured() const override; experimental::MemoryRequirements workspace() const override; bool isVarWeightsKernel() const override { - if(!_gemm_kernel_asm) + if (!_gemm_kernel_asm) return false; - const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); + const arm_compute::WeightFormat wf = + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; } @@ -229,15 +245,15 @@ private: void prepare_indirect_buffer(ITensorPack &tensors); /** Assembly Gemm kernel */ - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; + std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr}; /** Optimised Arm® Neon™ kernel */ - std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; + std::unique_ptr<INEKernel> _optimised_kernel{nullptr}; /** Assembly GEMM workspace tensor info */ TensorInfo _workspace_info{}; /** Pre-transpose tensor info */ TensorInfo 
_pretranspose_info{}; /** Prepared flag */ - bool _is_prepared{ false }; + bool _is_prepared{false}; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; /** GEMM kernel description */ @@ -251,26 +267,27 @@ private: /** Indirect buffer */ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; - std::vector<TypeInput> _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - experimental::MemoryRequirements _aux_mem{ Count }; - bool _B_pretranspose_required{ false }; - bool _is_b_constant{ true }; - bool _is_c_constant{ true }; + std::vector<TypeInput> _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool _is_b_constant{true}; + bool _is_c_constant{true}; }; template <typename TypeInput, typename TypeOutput, class OutputStage> std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> -Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers) +Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, + const std::vector<int32_t> &multipliers) { _multipliers = multipliers; _shifts = shifts; bool need_left = false; - for(const auto s : _shifts) + for (const auto s : _shifts) { left_shifts.push_back(std::max(-s, int32_t(0))); right_shifts.push_back(std::min(-s, int32_t(0))); - if(s < 0 && !need_left) + if (s < 0 && !need_left) { need_left = true; } @@ -295,32 +312,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInput); - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) { - for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + for (int64_t output_x = 0; output_x < _cp.output_width; output_x++) { int64_t output_xy = (output_y * _cp.output_width) + output_x; - for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) { - for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) { int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; int64_t input_xy = (input_y * _cp.input_width) + input_x; - if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); } else { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = A_ptr + (m * multi_stride_A + b * batch_stride_A 
+ input_xy * stride_A); } } @@ -332,12 +352,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); float zeropad = 0.f; - if(is_data_type_quantized(a->data_type())) + if (is_data_type_quantized(a->data_type())) { zeropad = a->quantization_info().uniform().offset; } @@ -350,16 +373,25 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]); const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]); - _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, - info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad - }; - - if(info.method == AsmConvMethod::Conv) + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, + info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) { _gemm_kernel_asm->set_convolution_parameters(_cp); } - if(info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Indirect) { const unsigned int multis = 1; const unsigned int batches = a->tensor_shape().total_size_upper(3); @@ -372,19 +404,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInputPtr); - _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); - _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>( + reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>( + reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad)); // Set indirect argument int64_t pos = 0; - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) { - (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + (_indirect_arg.get())[pos++] = + _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; } } } @@ -394,8 +429,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, 
TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, +void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os) { ARM_COMPUTE_UNUSED(c); @@ -404,7 +443,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * _is_c_constant = c ? c->are_values_constant() : true; _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); - if(_gemm_kernel_asm == nullptr) + if (_gemm_kernel_asm == nullptr) { //configuration not supported: Leave function unconfigured: return; @@ -419,13 +458,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * const size_t workspace_size = _gemm_kernel_asm->get_working_size(); const unsigned int alignment = 4096; _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); - _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[AsmGemmWorkspace] = + MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - if(window_size < static_cast<unsigned int>(args._maxthreads)) + if (window_size < static_cast<unsigned int>(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -434,18 +474,19 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * _optimised_kernel = std::move(acl_gemm_wrapper); _gemm_info = gemm_info; // Check for pre-transposed support - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); - _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); - _B_pretranspose_required = true; + _aux_mem[Pretranspose] = + MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); + _B_pretranspose_required = true; } // Handle indirect GEMM convolution - if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) { configure_indirect(a, b, d, gemm_info); } @@ -454,34 +495,39 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * template <typename TypeInput, typename TypeOutput, class OutputStage> void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
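The next statement hands the S32 bias to the assembly kernel as a raw pointer computed from the tensor's buffer plus its first-element byte offset; run() derives the TypeOutput bias the same way for non-S32 C. A minimal sketch of that pointer arithmetic with made-up values (the padding offset is hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Stand-in for ITensor::buffer(): raw bytes with some leading padding.
        alignas(4) std::uint8_t storage[20] = {};
        const std::size_t offset_first_element_in_bytes = 4; // hypothetical
        auto *bias = reinterpret_cast<std::int32_t *>(storage + offset_first_element_in_bytes);
        bias[0] = 42; // the first bias element sits just past the padding
        std::printf("bias[0] = %d\n", bias[0]);
        return 0;
    }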
- if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Fixed format kernels need no pretranspose. - ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto in1_ptr = + reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), + in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); b->mark_as_unused(); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { prepare_indirect_buffer(tensors); } @@ -526,12 +572,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) int multi_stride_b = 0; const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size(); - auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes()); // Check if B is pre-tranposed and de-reference if not - if(!_gemm_kernel_asm->B_is_pretransposed()) + if (!_gemm_kernel_asm->B_is_pretransposed()) { ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); @@ -539,30 +585,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) } // If necessary, run pretranspose every time if either weights or biases are non-constant - if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) + if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) { - if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - 
_gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_B_pretranspose_required) + if (_B_pretranspose_required) { - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto b_ptr = + reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - if(_is_b_constant) + if (_is_b_constant) { _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b); } else { - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), + b_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); } } } @@ -571,17 +621,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); - if(workspace.get()->buffer() != nullptr) + if (workspace.get()->buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); - if(window_size < num_threads) + if (window_size < num_threads) { num_threads = window_size; } - if(split_dim != IScheduler::split_dimensions_all) + if (split_dim != IScheduler::split_dimensions_all) { // Make sure the kernel does not expect more threads than we can actually spawn const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); @@ -595,12 +645,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
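run_parallel_pretranspose_B_array(), defined earlier in this file and invoked from both prepare() and the non-constant-weights path above, assigns thread t the window slice [t*wsize/n, (t+1)*wsize/n). A standalone check that this formula tiles the window with no gaps and no overlap, even when wsize is not divisible by the thread count:

    #include <cassert>

    int main()
    {
        const unsigned int wsize = 13, num_threads = 4; // deliberately not divisible
        unsigned int covered = 0;
        for (unsigned int t = 0; t < num_threads; ++t)
        {
            const unsigned int start = (t * wsize) / num_threads;
            const unsigned int end   = ((t + 1) * wsize) / num_threads;
            assert(start == covered); // each slice starts where the previous ended
            covered = end;
        }
        assert(covered == wsize); // the slices cover the whole window
        return 0;
    }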
TypeOutput *bias = nullptr; - if(c && c->info()->data_type() != DataType::S32) + if (c && c->info()->data_type() != DataType::S32) { bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { in0_ptr = nullptr; lda = 0; @@ -609,18 +659,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) } // Set gemm parameters - _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, - in1_ptr, ldb, multi_stride_b, - out_ptr, ldd, batch_stride_d, multi_stride_d, - bias, 0); + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template <typename TypeInput, typename TypeOutput> void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); @@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge template <typename TypeInput, typename TypeOutput> void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); @@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; - 
-    if(os_info.gemmlowp_shifts.size() > 1)
+    if (os_info.gemmlowp_shifts.size() > 1)
     {
-        const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
-        gemm_requant_info          = arm_gemm::Requantize32(nullptr, 0,
-                                                            a_offset, b_offset, os_info.gemmlowp_offset,
-                                                            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
-                                                            std::get<2>(requantize_data),
-                                                            std::get<3>(requantize_data),
-                                                            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+        const auto requantize_data =
+            fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+        gemm_requant_info = arm_gemm::Requantize32(
+            nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+            std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
     }
     else
     {
-        gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
-                                                   a_offset, b_offset, os_info.gemmlowp_offset,
-                                                   -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
-                                                   os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+        gemm_requant_info =
+            arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+                                   os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
     }
 
     // Configure fallback
@@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
 }
 } //namespace
 
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
-    : _arm_gemm(nullptr)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
 {
 }
 
-Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                                             const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                                             const ITensorInfo *a,
+                                             const ITensorInfo *b,
+                                             const ITensorInfo *c,
+                                             const ITensorInfo *d,
+                                             const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_UNUSED(c);
@@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
 
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
-    switch(a->data_type())
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+                            info.fixed_format, info.fast_mode, &cfg);
+    switch (a->data_type())
     {
         case DataType::F32:
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for F32 input");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for F32 input");
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for U8 input and U8 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for U8 input and U8 output");
             }
             break;
         case DataType::S8:
        case DataType::QASYMM8_SIGNED:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for S8 input and S8 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for S8 input and S8 output");
             }
             break;
 #endif /* __aarch64__ */
 #if defined(ARM_COMPUTE_ENABLE_BF16)
         case DataType::BFLOAT16:
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for BFLOAT16 input and F32 output");
             break;
         }
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for F16 input and F16 output");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for F16 input and F16 output");
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
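has_opt_impl() above reports, through its out-parameter, the weight format the selected kernel actually expects. Passing WeightFormat::ANY turns it into a query that a caller can run before committing to a weight layout; a hedged sketch of that pattern (local names are illustrative, and tensor infos a/b/c/d are assumed to exist):

    arm_compute::WeightFormat expected = arm_compute::WeightFormat::ANY;
    AsmGemmInfo query_info{};
    query_info.fixed_format  = true;
    query_info.weight_format = arm_compute::WeightFormat::ANY;
    const Status status = CpuGemmAssemblyDispatch::has_opt_impl(expected, a, b, c, d, query_info);
    if (bool(status) && expected != arm_compute::WeightFormat::ANY)
    {
        // Re-pack the weights into 'expected' before calling configure().
    }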
is false"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), + "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::BFLOAT16, DataType::F16, DataType::F32); + if (is_data_type_quantized_per_channel(b->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); } - else if(is_fixed_format_fast_math(info.weight_format)) + else if (is_fixed_format_fast_math(info.weight_format)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, + "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, + "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, + "Only F32 output supported for BFLOAT16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), "Only 
                                     "Only QASYMM8/S32 output supported for QASYMM8 input");
 
     arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
     const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
-    if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+    if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
     {
         // Correctness check: if the format expected by the kernel is
         // not "any", make sure that the one found matches the format
         // intended by the caller.
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
-                                        "The format expected by the kernel does not correspond with the one requested by the user.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (expected_weight_format != info.weight_format),
+            "The format expected by the kernel does not correspond with the one requested by the user.");
     }
     return ret;
 }
@@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo
 
     return act.type != arm_gemm::Activation::Type::None;
 }
 
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+    if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
     {
         return;
     }
 
-    switch(a->data_type())
+    switch (a->data_type())
     {
         case DataType::F32:
             create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
@@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
                 create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
             }
@@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
             break;
         case DataType::S8:
        case DataType::QASYMM8_SIGNED:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
                 create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
             }
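As the comment in configure() above notes, an unsupported data-type combination makes configure() return silently instead of raising an error, so callers are expected to verify success themselves. A short caller-side sketch (hedged; it relies on the is_configured() accessor that the comment refers to):

    CpuGemmAssemblyDispatch gemm;
    gemm.configure(a, b, c, d, info);
    ARM_COMPUTE_ERROR_ON_MSG(!gemm.is_configured(), "No assembly kernel available for this configuration");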
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index ceb7a3f775..5be39a54c0 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -42,20 +43,20 @@ enum class AsmConvMethod
 
 struct AsmGemmInfo
 {
-    AsmConvMethod method{ AsmConvMethod::Im2Col };
+    AsmConvMethod method{AsmConvMethod::Im2Col};
     PadStrideInfo ps_info{};
     ActivationLayerInfo activation_info{};
     GEMMLowpOutputStageInfo output_stage{};
-    bool negated_offsets{ true };
-    bool reinterpret_input_as_3d{ false };
-    bool depth_output_gemm3d{ false };
-    int64_t padding_top{ 0 };
-    int64_t padding_left{ 0 };
-    float padding_value{ 0.f };
-    bool fast_mode{ false };
-    bool fixed_format{ false };
-    arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
-    bool reshape_b_only_on_first_run{ true };
+    bool negated_offsets{true};
+    bool reinterpret_input_as_3d{false};
+    bool depth_output_gemm3d{false};
+    int64_t padding_top{0};
+    int64_t padding_left{0};
+    float padding_value{0.f};
+    bool fast_mode{false};
+    bool fixed_format{false};
+    arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+    bool reshape_b_only_on_first_run{true};
 };
 
 /** Assembly kernel glue */
@@ -72,12 +73,12 @@ public:
     class IFallback
     {
     public:
-        virtual void run(ITensorPack &tensors)                      = 0;
-        virtual void prepare(ITensorPack &tensors)                  = 0;
-        virtual experimental::MemoryRequirements workspace() const  = 0;
-        virtual bool is_configured() const                          = 0;
-        virtual bool isVarWeightsKernel() const                     = 0;
-        virtual ~IFallback()                                        = default;
+        virtual void                             run(ITensorPack &tensors)     = 0;
+        virtual void                             prepare(ITensorPack &tensors) = 0;
+        virtual experimental::MemoryRequirements workspace() const             = 0;
+        virtual bool                             is_configured() const         = 0;
+        virtual bool                             isVarWeightsKernel() const    = 0;
+        virtual ~IFallback()                                                   = default;
     };
 
 public:
@@ -121,7 +122,8 @@ public:
      * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
      * @param[in]  info GEMM meta-data
      */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+    void configure(
+        const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
 
     /** Indicates whether or not this function can be used to process the given parameters.
      *
@@ -133,7 +135,11 @@ public:
      *
      * @return a status.
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *d,
+                           const AsmGemmInfo &info);
 
     /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
      *
@@ -144,7 +150,12 @@ public:
      *
      * @return a status.
      */
-    static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+    static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+                               const ITensorInfo *a,
+                               const ITensorInfo *b,
+                               const ITensorInfo *c,
+                               const ITensorInfo *d,
+                               const AsmGemmInfo &info);
     /** Checks if activation is supported by the gemm assembly dispatcher
      *
      * @param[in] activation Activation to check
      *
@@ -167,8 +178,8 @@ public:
     }
 
     // Inherited methods overridden:
-    void prepare(ITensorPack &tensors) override;
-    void run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
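Taken together, the header above exposes the dispatcher's whole lifecycle: a static validate(), then configure(), then prepare() and run() against an ITensorPack, with workspace() describing the auxiliary memory the caller should provide. A hedged end-to-end sketch (tensor creation, allocation and pack ids elided; names illustrative):

    CpuGemmAssemblyDispatch gemm;
    AsmGemmInfo info{}; // field defaults as listed in the struct above
    if (bool(CpuGemmAssemblyDispatch::validate(a, b, c, d, info)))
    {
        gemm.configure(a, b, c, d, info);
        const experimental::MemoryRequirements aux = gemm.workspace(); // allocate auxiliary tensors from this
        ITensorPack pack{};
        // ... bind a/b/c/d and the auxiliary buffers into 'pack' ...
        gemm.prepare(pack); // one-off work such as pretransposing B, as in prepare() above
        gemm.run(pack);     // per-call execution
    }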