Diffstat (limited to 'src/cpu')
48 files changed, 3179 insertions, 825 deletions
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 4253027231..555705bd45 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -32,6 +32,7 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/activation/list.h" +#include "src/cpu/kernels/logistic/list.h" #include <array> @@ -71,6 +72,12 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel }, REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ + {"sme2_fp32_logistic", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::F32 && data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC && + data.isa.sme2; + }, + REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)}, {"sve2_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && @@ -316,6 +323,11 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac // Use squashed window std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); + // Collapse window with SME kernels in Y-Dim + if (std::string(uk->name) == "sme2_fp32_logistic") + { + win = win.collapse(win, Window::DimY); + } ICPPKernel::configure(win); } diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp index d4af8bedaf..9e9137a266 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,7 @@ #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d_output_stage/list.h" #include <arm_neon.h> #include <cstddef> @@ -95,316 +96,6 @@ Status validate_arguments(const ITensorInfo *src return Status{}; } - -template <typename T> -typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nchw(ITensor *src, - const ITensor *bias, - const Window &window, - ITensor *dst, - int result_fixedpoint_multiplier, - int result_shift, - int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - execute_window_loop( - win, - [&](const Coordinates &id) - { - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if (has_bias) - { - const auto vb = wrapper::vdup_n( - *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } - - const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } - - // Left-overs loop - for (; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); - - // Accumulate bias - if (has_bias) - { - const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; - } - }, - in, out); -} - -template <typename T> -typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nhwc(ITensor *src, - const ITensor *bias, - const Window &window, - ITensor *dst, - int result_fixedpoint_multiplier, - int result_shift, - int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if (has_bias) - { - const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } - - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } - - // Left-overs loop - for (; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); - - // Accumulate bias - if (has_bias) - { - const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = 
reinterpret_cast<T *>(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); -} - -// Quantized case -template < - typename TOut, - typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> -void output_stage_nchw(ITensor *src, - const ITensor *bias, - const Window &window, - ITensor *dst, - int result_fixedpoint_multiplier, - int result_shift, - int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; - using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop( - win, - [&](const Coordinates &id) - { - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12)}}; - - // Accumulate bias - if (has_bias) - { - const auto vb = wrapper::vdup_n( - *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; - } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, - result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for (; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Accumulate bias - if (has_bias) - { - const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - *out_ptr = - finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); - } - }, - in, out); -} -template < - typename TOut, - typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> -void output_stage_nhwc(ITensor *src, - const ITensor *bias, - const Window &window, - ITensor *dst, - int result_fixedpoint_multiplier, - int result_shift, - int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; - using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = 
wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32x4x4_t v_in = {{ - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - }}; - - // Accumulate bias - if (has_bias) - { - const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; - - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); - } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, - result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for (; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32_t s_in = *in_ptr; - - // Accumulate bias - if (has_bias) - { - const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - *out_ptr = - finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); - } - }, - in, bi, out); -} } // namespace void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, @@ -447,24 +138,30 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo { if (is_qasymm8_signed) { - _func = &output_stage_nchw<int8_t>; +#ifdef ENABLE_QASYMM8_SIGNED_KERNELS + _func = &output_stage_nchw_qs8; +#endif // ENABLE_QASYMM8_SIGNED_KERNELS } else { - _func = &output_stage_nchw<uint8_t>; +#ifdef ENABLE_QASYMM8_KERNELS + _func = &output_stage_nchw_qu8; +#endif // ENABLE_QASYMM8_KERNELS } break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ENABLE_FP16_KERNELS case DataType::F16: { - _func = &output_stage_nchw<float16_t>; + _func = &output_stage_nchw_fp16; break; } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif // ENABLE_FP16_KERNELS case DataType::F32: { - _func = &output_stage_nchw<float>; +#ifdef ENABLE_FP32_KERNELS + _func = &output_stage_nchw_fp32; +#endif // ENABLE_FP32_KERNELS break; } default: @@ -481,24 +178,30 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo { if (is_qasymm8_signed) { - _func = &output_stage_nhwc<int8_t>; +#ifdef ENABLE_QASYMM8_SIGNED_KERNELS + _func = &output_stage_nhwc_qs8; +#endif // ENABLE_QASYMM8_SIGNED_KERNELS } else { - _func = 
&output_stage_nhwc<uint8_t>; +#ifdef ENABLE_QASYMM8_KERNELS + _func = &output_stage_nhwc_qu8; +#endif // QASYMM8_SIGNED_KERNELS } break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ENABLE_FP16_KERNELS case DataType::F16: { - _func = &output_stage_nhwc<float16_t>; + _func = &output_stage_nhwc_fp16; break; } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif // ENABLE_FP16_KERNELS case DataType::F32: { - _func = &output_stage_nhwc<float>; +#ifdef ENABLE_FP32_KERNELS + _func = &output_stage_nhwc_fp32; +#endif // ENABLE_FP32_KERNELS break; } default: diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp index b5b2aed1ba..9c37ece3dd 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Steps.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" @@ -35,10 +35,8 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/cpu/kernels/conv3d/neon/list.h" - -#include <algorithm> +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/conv3d/list.h" using namespace arm_compute::detail; @@ -51,18 +49,16 @@ namespace kernels namespace { static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = { -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) {"neon_fp16_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)}, -#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + REGISTER_FP16_NEON(directconv3d_fp16_neon_ndhwc)}, {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)}, + REGISTER_FP32_NEON(directconv3d_fp32_neon_ndhwc)}, {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)}, + REGISTER_QASYMM8_NEON(directconv3d_qu8_neon_ndhwc)}, {"neon_qasymm8_signed_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}}; + REGISTER_QASYMM8_SIGNED_NEON(directconv3d_qs8_neon_ndhwc)}}; Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp index 5b88735e7a..87340e566e 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -684,6 +684,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_type() == 
DataType::QASYMM8_SIGNED && + src1->data_type() == DataType::QASYMM8, + "QASYMM8_SIGNED input with QASYMM8 weights not supported"); + TensorShape in0_shape = src0->tensor_shape(); TensorShape in1_shape = src1->tensor_shape(); TensorShape out_shape = dst->tensor_shape(); diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 7c1e4772a6..96ddad9d19 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -105,6 +105,13 @@ struct SoftmaxKernelDataTypeISASelectorData cpuinfo::CpuIsaInfo isa; bool is_log; int axis; + uint64_t sme2_vector_length; +}; + +struct ScatterKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; unsigned long sme2_vector_length; }; @@ -124,6 +131,8 @@ using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; using SoftmaxKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type; +using ScatterKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const ScatterKernelDataTypeISASelectorData &data)>::type; } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index 8001482154..d7a3a77d51 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,7 @@ #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/mul/generic/neon/list.h" +#include "src/cpu/kernels/mul/generic/sme2/list.h" #include <arm_neon.h> @@ -317,6 +318,41 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor } } +bool mul_q8_sme_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale) +{ + const auto &in0_shape = src0->tensor_shape(); + const auto &in1_shape = src1->tensor_shape(); + const unsigned int dst_dims = dst->num_dimensions(); + + // Calculate Scale + const auto iq0 = src0->quantization_info().uniform(); + const auto iq1 = src1->quantization_info().uniform(); + const auto oq = dst->quantization_info().uniform(); + const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale; + const auto max_result = multiplier * (127) * (127) + static_cast<float>(oq.offset); + const auto min_result = multiplier * (-128) * (-128) + static_cast<float>(oq.offset); + + // Does not support broadcasting on x + // Does not support dims > 4D output, unless input shapes are identical (therefore collapsible) + // Checks whether CPU has SME2 Available + if (in0_shape.x() == in1_shape.x() && CPUInfo::get().has_sme2() && (in0_shape == in1_shape || dst_dims <= 4)) + { + // Check if multiplier cannot be stored as a 14.18 signed fixed-point number + if (multiplier < -8191.f || multiplier > 8191.f) + { + return false; + } + // It might not be possible to store the result as a 14.18 signed fixed-point number. 
+ if (max_result > 8191.f || min_result < -8191.f) + { + return false; + } + // Passed all checks + return true; + } + return false; +} + bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, @@ -1563,7 +1599,11 @@ void CpuMulKernel::configure(ITensorInfo *src1, case DataType::QASYMM8_SIGNED: if (dt_input2 == DataType::QASYMM8_SIGNED) { - if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_sme_possible(src1, src2, dst, scale) && rounding_policy == RoundingPolicy::TO_ZERO) + { + _func_quantized = REGISTER_QASYMM8_SIGNED_SME2(arm_compute::cpu::sme2_q8_signed_mul); + } + else if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint<int8_t>; } diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp index b444a25ff7..c6e0dd3a5e 100644 --- a/src/cpu/kernels/CpuPermuteKernel.cpp +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -97,15 +97,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const template <typename T> void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm) { - const DataLayout src_layout = src->info()->data_layout(); - // Source window Window window_src = window; // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others // we have to fall back to C++ - if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) || - (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})) + if (perm == PermutationVector{2U, 0U, 1U} || perm == PermutationVector{1U, 2U, 0U}) { window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); @@ -128,49 +125,16 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c Iterator src_it(src, window_src); Iterator dst_it(dst, window_dst); - int in_row_stride = 0; - int in_col_stride = 0; - int in_channel_stride = 0; - int in_batch_stride = 0; - int n_cols = 0; - int n_rows = 0; - int n_channels = 0; - int n_batches = 0; - - switch (src_layout) - { - case DataLayout::NCHW: - { - in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_cols = src->info()->tensor_shape().x(); - n_rows = window_src.y().step(); - n_channels = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - case DataLayout::NHWC: - { - in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_channels = src->info()->tensor_shape().x(); - n_cols = window_src.y().step(); - n_rows = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - default: - { - ARM_COMPUTE_ERROR("Invalid source data layout."); - break; - } - } - // CHW -> HWC - if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) + if (perm == PermutationVector{2U, 0U, 1U}) { + const int in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); + const int in_channel_stride = 
src->info()->strides_in_bytes().z() / sizeof(T); + const int in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + const int n_cols = src->info()->tensor_shape().x(); + const int n_rows = window_src.y().step(); + const int n_channels = src->info()->tensor_shape().z(); + const int n_batches = src->info()->tensor_shape()[3]; const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); @@ -188,8 +152,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c src_it, dst_it); } // HWC -> CHW - else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}) + else if (perm == PermutationVector{1U, 2U, 0U}) { + const int in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); + const int in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); + const int in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + const int n_channels = src->info()->tensor_shape().x(); + const int n_cols = window_src.y().step(); + const int n_rows = src->info()->tensor_shape().z(); + const int n_batches = src->info()->tensor_shape()[3]; const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp index 241e58fbce..78b08f19d2 100644 --- a/src/cpu/kernels/CpuReshapeKernel.cpp +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -233,7 +233,9 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) if (!src_has_holes && !dst_has_holes) { - std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info); + size_t split_dimension; + + std::tie(win, split_dimension) = calculate_squashed_or_max_window(*dst_info); /* Copy the tensor per window. If the src and dst tensors are contiguous memory allocations without any holes or @@ -241,7 +243,15 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) we can use use a single memcopy call to copy the whole window in reshape_tensor_per_window fn */ - _reshape_tensor_fn = reshape_tensor_per_window; + if (split_dimension != Window::DimY) + { + // Fall back when split dimension doesn't equal Window::DimY + _reshape_tensor_fn = reshape_tensor_per_row; + } + else + { + _reshape_tensor_fn = reshape_tensor_per_window; + } } else { diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h index ce566fd9e2..acad441b76 100644 --- a/src/cpu/kernels/CpuReshapeKernel.h +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H -#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -74,21 +74,10 @@ public: */ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; - /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. - * - * @return The split dimension. - */ - size_t get_split_dimension() const - { - return _split_dimension; - } - private: - size_t _split_dimension{Window::DimY}; - std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPURESHAPEKERNEL_H diff --git a/src/cpu/kernels/CpuScatterKernel.cpp b/src/cpu/kernels/CpuScatterKernel.cpp new file mode 100644 index 0000000000..bc0fa724b0 --- /dev/null +++ b/src/cpu/kernels/CpuScatterKernel.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuScatterKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" + +#include <vector> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ + +/* Scatter */ +static const std::vector<typename CpuScatterKernel::ScatterKernel> available_kernels = { + +}; + +} // namespace + +const std::vector<typename CpuScatterKernel::ScatterKernel> &CpuScatterKernel::get_available_kernels() +{ + return available_kernels; +} + +void CpuScatterKernel::configure(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &info) +{ + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_UNUSED(updates); + ARM_COMPUTE_UNUSED(indices); + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_UNUSED(_run_method); +} + +Status CpuScatterKernel::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &info) +{ + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_UNUSED(updates); + ARM_COMPUTE_UNUSED(indices); + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_UNUSED(info); + + return Status{ErrorCode::RUNTIME_ERROR, "No configuration implemented yet."}; +} + +void CpuScatterKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(tensors); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_UNUSED(_run_method); +} + +const char *CpuScatterKernel::name() const +{ + return _name.c_str(); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuScatterKernel.h b/src/cpu/kernels/CpuScatterKernel.h new file mode 100644 index 0000000000..77c436034f --- /dev/null +++ b/src/cpu/kernels/CpuScatterKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H + +#include "arm_compute/function_info/ScatterInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Arm(R) Neon(TM) kernel to perform the ScatterND operation */ +class CpuScatterKernel : public ICpuKernel<CpuScatterKernel> +{ +private: + using ScatterKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, const ITensor *, ITensor *, const ScatterInfo, const Window &)>::type; + +public: + CpuScatterKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScatterKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info for the source matrix. + * @param[in] updates Input tensor info for the Update matrix. Data type supported: same as @p src + * @param[in] indices Input tensor info for the Indices matrix. Data type supported: U32. + * @param[out] dst Output tensor info. Data type supported: same as @p src + * @param[in] info Attributes for Scatter Kernel + */ + void configure(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuScatterKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + struct ScatterKernel + { + const char *name; + ScatterKernelPtr ukernel; + }; + + static const std::vector<ScatterKernel> &get_available_kernels(); + +private: + ScatterKernelPtr _run_method{nullptr}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUSCATTERKERNEL_H diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp index 297ba63826..f8e9d123b5 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021,2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -72,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + if (!src->quantization_info().is_dynamic()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } } return Status{}; diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index e2a27675b3..72fafca1bb 100644 --- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -52,7 +52,7 @@ namespace kernel * * */ -template <typename TypeInput, typename TypeOutput> +template <typename TypeInput, typename TypeWeight, typename TypeOutput> class CpuGemmAssemblyWrapperKernel final : public INEKernel { public: @@ -101,7 +101,7 @@ public: * @param[in] kernel Pointer to an assembly kernel implementation. * @param[in] kernel_name_tag Tag to be attacehd to the kernel's name. */ - void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag) + void configure(arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput> *kernel, std::string kernel_name_tag) { ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel))); _kernel = kernel; @@ -131,8 +131,8 @@ public: } private: - arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; - std::string _name; + arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput> *_kernel; + std::string _name; }; } // namespace kernel } // namespace cpu diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp index 941fed0ba8..cbc8be416e 100644 --- a/src/cpu/kernels/assembly/arm_gemm.hpp +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -277,8 +277,8 @@ struct Nothing { }; -template <typename Top, typename Tret> -using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>; +template <typename Tlop, typename Trop, typename Tret> +using UniqueGemmCommon = std::unique_ptr<GemmCommon<Tlop, Trop, Tret>>; /* Low level API calls. * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. 
*/ @@ -288,13 +288,13 @@ using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>; template <typename Top, typename Tret, class OutputStage = Nothing> KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); -template <typename Top, typename Tret, class OutputStage = Nothing> -UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {}); +template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing> +UniqueGemmCommon<Tlop, Trop, Tret> gemm(const GemmArgs &args, const OutputStage & = {}); -template <typename Top, typename Tret, class OutputStage = Nothing> +template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing> std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); -template <typename Top, typename Tret, class OutputStage = Nothing> +template <typename Tlop, typename Trop, typename Tret, class OutputStage = Nothing> bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {}); } // namespace arm_gemm diff --git a/src/cpu/kernels/assembly/convolution_parameters.hpp b/src/cpu/kernels/assembly/convolution_parameters.hpp index a6cf96344c..09b73ca409 100644 --- a/src/cpu/kernels/assembly/convolution_parameters.hpp +++ b/src/cpu/kernels/assembly/convolution_parameters.hpp @@ -61,6 +61,8 @@ struct ConvolutionParameters int64_t output_stride_w; int64_t output_stride_h; // output_channels not included as they do not affect the input. + int64_t dilation_w; + int64_t dilation_h; int64_t padding_top; int64_t padding_left; float padding_value; diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index 45d1e43274..d5676c134e 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -35,6 +35,7 @@ namespace arm_gemm { // Avoid circular dependency with arm_gemm.hpp struct GemmConfig; +struct Requantize32; // Abstract class for the GEMM/GEMV functions. // @@ -160,6 +161,12 @@ public: { } + /*** "Quantization update" interface (optional) ***/ + /* Update quantization parameters at run time */ + virtual void update_quantization_parameters(const Requantize32 &) + { + } + /*** Convolution interface (optional) ***/ /* Set the convolution parameters. */ virtual void set_convolution_parameters(ConvolutionParameters) @@ -189,7 +196,7 @@ public: * 'set_arrays' to capture the provided arguments in protected class * members, as essentially any implementation will need these. 
*/ -template <typename To, typename Tr> +template <typename To, typename Tw, typename Tr> class GemmCommon : public IGemmCommon { protected: @@ -197,7 +204,7 @@ protected: int _lda = 0; int _A_batch_stride = 0; int _A_multi_stride = 0; - const To *_Bptr = nullptr; + const Tw *_Bptr = nullptr; int _ldb = 0; int _B_multi_stride = 0; Tr *_Cptr = nullptr; @@ -214,7 +221,7 @@ public: const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, + const Tw *B, const int ldb, /* batches share B */ const int B_multi_stride, Tr *C, @@ -254,7 +261,7 @@ public: const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override { - set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb, + set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const Tw *>(B), ldb, B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, static_cast<const Tr *>(bias), bias_multi_stride); } @@ -262,17 +269,17 @@ public: /*** "Pretransposed" interface ***/ /* Compute col sums over all columns */ - virtual void requantize_bias(void *, const To *, const int, const int){}; + virtual void requantize_bias(void *, const Tw *, const int, const int){}; /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int, bool){}; + virtual void pretranspose_B_array(void *, const Tw *, const int, const int, bool){}; /* Implementation of the void * overload which casts its arguments to the appropriate type. */ void pretranspose_B_array_generic( void *out, const void *in, const int row_stride, const int multi_stride, bool transposed) override { - pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride, transposed); + pretranspose_B_array(out, static_cast<const Tw *>(in), row_stride, multi_stride, transposed); } /* Threaded versions of the above. @@ -280,7 +287,7 @@ public: * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only * legal values for start and end are 0 and 1 respectively. */ virtual void pretranspose_B_array_part( - void *out, const To *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t) + void *out, const Tw *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t) { pretranspose_B_array(out, in, row_stride, multi_stride, transposed); }; @@ -293,7 +300,7 @@ public: size_t start, size_t end) override { - pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, transposed, start, end); + pretranspose_B_array_part(out, static_cast<const Tw *>(in), row_stride, multi_stride, transposed, start, end); } /*** Indirect interface ***/ diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/generic/neon/float_impl.h index 082c60be29..5b5611a02f 100644 --- a/src/cpu/kernels/conv3d/neon/list.h +++ b/src/cpu/kernels/conv3d/generic/neon/float_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,21 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_CONV3D_LIST_H -#define SRC_CORE_NEON_KERNELS_CONV3D_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H +#define ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/cpu/kernels/conv3d/neon/quantized.h" namespace arm_compute { namespace cpu { +namespace kernels +{ + template <typename T> void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, @@ -192,6 +196,7 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, out); } +} // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H +#endif // ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_FLOAT_IMPL_H diff --git a/src/cpu/kernels/conv3d/generic/neon/fp16.cpp b/src/cpu/kernels/conv3d/generic/neon/fp16.cpp new file mode 100644 index 0000000000..1737556e51 --- /dev/null +++ b/src/cpu/kernels/conv3d/generic/neon/fp16.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void directconv3d_fp16_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + directconv3d_float_neon_ndhwc<float16_t>(src0, src1, src2, dst, conv_info, window); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/conv3d/generic/neon/fp32.cpp b/src/cpu/kernels/conv3d/generic/neon/fp32.cpp new file mode 100644 index 0000000000..1cd0793442 --- /dev/null +++ b/src/cpu/kernels/conv3d/generic/neon/fp32.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +void directconv3d_fp32_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + directconv3d_float_neon_ndhwc<float>(src0, src1, src2, dst, conv_info, window); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp b/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..d0cb6fc1c1 --- /dev/null +++ b/src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +void directconv3d_qu8_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + directconv3d_quantized_neon_ndhwc<uint8_t>(src0, src1, src2, dst, conv_info, window); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..adffc1a3f8 --- /dev/null +++ b/src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +void directconv3d_qs8_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + directconv3d_quantized_neon_ndhwc<int8_t>(src0, src1, src2, dst, conv_info, window); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/generic/neon/quantized_impl.h index f0fc9b5a71..b6b41035f8 100644 --- a/src/cpu/kernels/conv3d/neon/quantized.h +++ b/src/cpu/kernels/conv3d/generic/neon/quantized_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H +#ifndef ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H +#define ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" @@ -37,6 +38,8 @@ namespace arm_compute { namespace cpu { +namespace kernels +{ template <typename T> void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, @@ -270,6 +273,7 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, }, out); } +} // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H +#endif // ACL_SRC_CPU_KERNELS_CONV3D_GENERIC_NEON_QUANTIZED_IMPL_H diff --git a/src/cpu/kernels/conv3d/list.h b/src/cpu/kernels/conv3d/list.h new file mode 100644 index 0000000000..256d28825d --- /dev/null +++ b/src/cpu/kernels/conv3d/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CONV3D_LIST_H +#define ACL_SRC_CPU_KERNELS_CONV3D_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +#define DECLARE_CONV3D_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, \ + const Conv3dInfo &conv_info, const Window &window) + +DECLARE_CONV3D_KERNEL(directconv3d_fp16_neon_ndhwc); +DECLARE_CONV3D_KERNEL(directconv3d_fp32_neon_ndhwc); +DECLARE_CONV3D_KERNEL(directconv3d_qu8_neon_ndhwc); +DECLARE_CONV3D_KERNEL(directconv3d_qs8_neon_ndhwc); +#undef DECLARE_CONV3D_KERNEL + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CONV3D_LIST_H diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h b/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h new file mode 100644 index 0000000000..266fa68ab2 --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H + +#include "arm_compute/core/Helpers.h" // Iterator +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T> +void output_stage_nchw_fp(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; + } + }, + in, out); +} + +template <typename T> +void output_stage_nhwc_fp(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); +} + +} // 
namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_FLOAT_IMPL_H diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp new file mode 100644 index 0000000000..d7550da721 --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp16.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void output_stage_nhwc_fp16(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nhwc_fp<float16_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +void output_stage_nchw_fp16(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nchw_fp<float16_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp new file mode 100644 index 0000000000..05dec370b9 --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/fp32.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/float_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void output_stage_nhwc_fp32(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nhwc_fp<float>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +void output_stage_nchw_fp32(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nchw_fp<float>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..dbdf951b6f --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void output_stage_nhwc_qu8(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nhwc_quant<uint8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +void output_stage_nchw_qu8(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nchw_quant<uint8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..c00fe87161 --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void output_stage_nhwc_qs8(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nhwc_quant<int8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +void output_stage_nchw_qs8(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + output_stage_nchw_quant<int8_t>(src, bias, window, dst, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h b/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h new file mode 100644 index 0000000000..f74ed55ad0 --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/generic/neon/quantized_impl.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H + +#include "arm_compute/core/Helpers.h" // Iterator +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +template <typename TOut> +void output_stage_nchw_quant(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12)}}; + + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, out); +} +template < + typename TOut, + typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> +void output_stage_nhwc_quant(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using 
TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{ + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + }}; + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32_t s_in = *in_ptr; + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, bi, out); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_GENERIC_NEON_QUANTIZED_IMPL_H diff --git a/src/cpu/kernels/directconv2d_output_stage/list.h b/src/cpu/kernels/directconv2d_output_stage/list.h new file mode 100644 index 0000000000..9372269bca --- /dev/null +++ b/src/cpu/kernels/directconv2d_output_stage/list.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ + +#define DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(func_name) \ + void func_name(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, \ + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) + +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp32); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_fp16); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp32); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_fp16); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qs8); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nhwc_qu8); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_qs8); +DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL(output_stage_nchw_qu8); + +#undef DECLARE_DIRECTCONV2D_OUTPUT_STAGE_KERNEL + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_OUTPUT_STAGE_LIST_H diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp index 404d070a37..580fdc3e8f 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -81,7 +81,7 @@ void vector_matrix_multiply_f32( // window_end_x is computed above which may cause out-of-bound writes to the dst. 
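+    // (Editorial note, illustrative) x is a zero-based column index into the reshaped matrix B, so
+    // the last valid column is width_matrix_b - 1. The previous guards ('x > width_matrix_b') only
+    // bailed out one step too late and let x == width_matrix_b through, which is exactly the
+    // out-of-bound case mentioned above; the guards in this hunk therefore use '>=' instead.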
for (; x < (window_end_x - window_step_x); x += window_step_x) { - if (x > width_matrix_b) + if (x >= width_matrix_b) { return; } @@ -203,7 +203,7 @@ void vector_matrix_multiply_f32( // Left-over loop for (; x < window_end_x; ++x) { - if (x > width_matrix_b) + if (x >= width_matrix_b) { return; } @@ -309,9 +309,21 @@ void matrix_matrix_multiply_f32( Iterator inb(rhs, win_b); Iterator out(dst, window); - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + // End address of matrix B at batch number n + const float *end_addr_mtx_b_at_batch_n = + reinterpret_cast<const float *>(inb.ptr()) + rhs->info()->dimension(0) * rhs->info()->dimension(1); + std::vector<const float *> end_addr_mtx_b_per_batch = {}; + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + const size_t out_dim2 = static_cast<int>(dst->info()->dimension(2)); - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + for (size_t b = 0; b < out_dim2; ++b) + { + // Store the ptrs to the last elem in the tensor for each batch + end_addr_mtx_b_per_batch.push_back(end_addr_mtx_b_at_batch_n); + end_addr_mtx_b_at_batch_n += + rhs->info()->dimension(2) != 1 ? rhs->info()->dimension(0) * rhs->info()->dimension(1) : 0; + } // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration @@ -341,220 +353,374 @@ void matrix_matrix_multiply_f32( #endif /* __arm__ */ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + + ARM_COMPUTE_ERROR_ON(end_addr_mtx_b_per_batch.size() == 0); + if (mtx_b1 < end_addr_mtx_b_per_batch[id.z()]) { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + for (; mtx_b0 < (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = 
vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, 
b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, 
a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + // Only consider one row from matrix b if subsequent row is out of boundary. + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + +#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } } - for (; mtx_b0 < mtx_b0_end_addr;) + // Leftover last row in matrix b, in case of there are odd number of rows in matrix B + else if (mtx_b0 < end_addr_mtx_b_per_batch[id.z()]) { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); + for (; mtx_b0 < (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); #if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*2]" 
::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b01 = vld1q_f32(mtx_b0 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b01 = vld1q_f32(mtx_b0 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); +#endif /* __arm__ */ + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b01 = vld1q_f32(mtx_b0 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + } + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = 
vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + +#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); +#endif /* __arm__ */ + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + } } // Multiply by the weight of matrix product (alpha) diff --git a/src/cpu/kernels/logistic/generic/sme2/fp32.cpp b/src/cpu/kernels/logistic/generic/sme2/fp32.cpp new file mode 100644 index 0000000000..876e466594 --- /dev/null +++ b/src/cpu/kernels/logistic/generic/sme2/fp32.cpp @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// This function expects a collapsed 2D shape. 
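+//
+// Editorial sketch (not part of the patch): per element the assembly below effectively computes
+//
+//     dst[i] = 1.0f / (1.0f + expf(-src[i]))
+//
+// where expf is approximated with the usual range reduction, writing x = -src[i]:
+//
+//     n = round(x / ln2);   r = x - n * ln2;          // ln2 applied in hi/lo parts for accuracy
+//     expf(x) ~= 2^n * (1 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5)
+//
+// n is obtained with the add-2^23 rounding trick and 2^n is formed by shifting the biased result
+// into the float exponent field. The min_input/max_input clamps flush the exp term to 0 for very
+// positive inputs (sigmoid -> 1) and to +inf for very negative inputs (sigmoid -> 0).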
+void sme2_f32_logistic_kernel(const float *src, + float *dst, + const uintptr_t shape[2], + const uintptr_t src_strides[2], + const uintptr_t dst_strides[2]) +{ + // Precondition: + assert(src_strides[0] == sizeof(float)); + assert(dst_strides[0] == sizeof(float)); + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + + ptrue p0.b + .inst 0x25207811 // ptrue pn9.b + + // Registers + // + // * x9: temporary, index + // * x10: temporary, inf + // * x11: temporary, 0 + // * x12: temporary, 1.0f + // * x13: temporary, body_length + // + // * x26: index_1 + // * x27: src_1 + // * x28: dst_1 + // + // * z0: c1 + // * z1: c2 + // * z2: c3 + // * z3: c4 + // * z4: c5 + // * z5: shift + // * z6: inv_ln2 + // * z7: neg_ln2_hi + // * z8: neg_ln2_lo + // * z9: min_input, max_input + // * z10: 23, 0, 1, inf + // * z11: max_value + // * z12-z15: x, r_hi, r, r2 + // * z16-z19: max_value, shift, z, scale, poly + // * z20-z21: n, p1, p12345 + // * z22-z23: n, p23, p2345 + // * z24-z25: p45 + // * z26: max_input + // * z28-z31: sum_value + // + // * za0-za3: sum_value + // + // * p0: all-true + // * p1-p4: underflow, + // * p4: leftover predicate + // * p5-p8: overflow, + // * pn9: all-true + + // TAYLORS CONSTANTS + mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb + mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010 + + movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb + movk x11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010 + + dup z0.s, w9 // c1. + dup z1.s, w10 // c2. + dup z2.s, w11 // c3. + dup z3.s, w12 // c4. + dup z4.s, w13 // c5. 
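+
+    // (Editorial note) c1..c5 are broadcast once here and reused for every vector below; together
+    // they approximate expm1 on the reduced argument, roughly
+    //     exp(r) - 1 ~= c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5   for r in about [-ln2/2, ln2/2].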
+ + mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + mov w14, #0xBD71 // max_input 88.37 = 0x42B0BD71 + + movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + movk w14, #0x42B0, LSL #16 // max_input (88.37) = 0x42B0BD71 + + dup z5.s, w9 // shift + dup z6.s, w10 // inv_ln2 + dup z7.s, w11 // neg_ln2_hi + dup z8.s, w12 // neg_ln2_lo + dup z9.s, w13 // min_input + dup z26.s, w14 // max_input + + mov w10, #0x0000 // inf: 0x7F800000 + movk w10, #0x7F80, LSL #16 // inf: 0x7F800000 + + mov w15, #0x0000 + movk w15, #0x3F80, LSL #16 // 1 + + mov w11, #0 // 0 + + // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl + cntw x13, ALL, MUL #4 // x13 is vl + udiv x9, %x[length], x13 // length/vl + mul x13, x13, x9 // x13 = vl * result + + // ================================================== + // Outer loop opening + // ================================================== + + mov x27, %x[src] // starting point of pointers for src. + mov x28, %x[dst] // starting point of pointers for dst. 
+ mov x26, %x[shape_1] + +outer_loop_start%=: + // for index_1 in shape_1 downto 1 + cmp x26, #0 + b.eq outer_loop_end%= + sub x26, x26, #1 + + mov x9, #0 // x9: index + +inner_body_start%=: + cmp x9, x13 + b.eq inner_body_end%= + + // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data + .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2] + + // ---------------------------------------------------------------- z12-z15: x = neg(x) + fneg z12.s, p0/m, z12.s + fneg z13.s, p0/m, z13.s + fneg z14.s, p0/m, z14.s + fneg z15.s, p0/m, z15.s + + // ---------------------------------------------------------------- p4-p7: underflow = x < min_input + fcmlt p1.s, p0/z, z12.s, z9.s + fcmlt p2.s, p0/z, z13.s, z9.s + fcmlt p3.s, p0/z, z14.s, z9.s + fcmlt p4.s, p0/z, z15.s, z9.s + + // ---------------------------------------------------------------- p4-p7: overflow = x > max_input + fcmlt p5.s, p0/z, z26.s, z12.s + fcmlt p6.s, p0/z, z26.s, z13.s + fcmlt p7.s, p0/z, z26.s, z14.s + fcmlt p8.s, p0/z, z26.s, z15.s + + // ---------------------------------------------------------------- z16-z19: shift + mov z16.d, z5.d + mov z17.d, z5.d + mov z18.d, z5.d + mov z19.d, z5.d + + // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2 + fmla z16.s, p0/m, z12.s, z6.s + fmla z17.s, p0/m, z13.s, z6.s + fmla z18.s, p0/m, z14.s, z6.s + fmla z19.s, p0/m, z15.s, z6.s + + // ---------------------------------------------------------------- z20-z23: n = z - shift + fsub z20.s, z16.s, z5.s + fsub z21.s, z17.s, z5.s + fsub z22.s, z18.s, z5.s + fsub z23.s, z19.s, z5.s + + // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi + fmla z12.s, p0/m, z20.s, z7.s + fmla z13.s, p0/m, z21.s, z7.s + fmla z14.s, p0/m, z22.s, z7.s + fmla z15.s, p0/m, z23.s, z7.s + + // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo + fmla z12.s, p0/m, z20.s, z8.s + fmla z13.s, p0/m, z21.s, z8.s + fmla z14.s, p0/m, z22.s, z8.s + fmla z15.s, p0/m, z23.s, z8.s + + // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n) + dup z10.s, #23 + urshl z16.s, p0/m, z16.s, z10.s + urshl z17.s, p0/m, z17.s, z10.s + urshl z18.s, p0/m, z18.s, z10.s + urshl z19.s, p0/m, z19.s, z10.s + + // Processes the first 2 vectors. 
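+
+    // (Editorial note) The quintic is evaluated in an Estrin-like order on each pair of vectors:
+    //     p12345 = p1 + r2 * (p23 + r2 * p45)
+    //            = c1*r + r^2*(c2 + c3*r) + r^4*(c4 + c5*r)
+    // and the final value is poly = scale + p12345 * scale = scale * (1 + p12345), i.e. 2^n * exp(r).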
+ + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z12.s, z0.s + fmul z21.s, z13.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z12.s, z2.s + fmla z23.s, p0/m, z13.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z12.s, z4.s + fmla z25.s, p0/m, z13.s, z4.s + + // ---------------------------------------------------------------- z12-z13: r2 = r * r + fmul z12.s, z12.s, z12.s + fmul z13.s, z13.s, z13.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z12.s, z24.s + fmla z23.s, p0/m, z13.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z12.s, z22.s + fmla z21.s, p0/m, z13.s, z23.s + + // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale + fmla z16.s, p0/m, z20.s, z16.s + fmla z17.s, p0/m, z21.s, z17.s + + // Processes the last 2 vectors + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z14.s, z0.s + fmul z21.s, z15.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z14.s, z2.s + fmla z23.s, p0/m, z15.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z14.s, z4.s + fmla z25.s, p0/m, z15.s, z4.s + + // ---------------------------------------------------------------- z14-z15: r2 = r * r + fmul z14.s, z14.s, z14.s + fmul z15.s, z15.s, z15.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z14.s, z24.s + fmla z23.s, p0/m, z15.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z14.s, z22.s + fmla z21.s, p0/m, z15.s, z23.s + + // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale + fmla z18.s, p0/m, z20.s, z18.s + fmla z19.s, p0/m, z21.s, z19.s + + // ---------------------------------------------------------------- z16-z19: poly = underflow ? 0 : poly + dup z10.s, #0 + sel z16.s, p1, z10.s, z16.s + sel z17.s, p2, z10.s, z17.s + sel z18.s, p3, z10.s, z18.s + sel z19.s, p4, z10.s, z19.s + + // ---------------------------------------------------------------- z16-z19: poly = overflow ? 
inf : poly + dup z10.s, w10 // z10: inf + sel z16.s, p5, z10.s, z16.s + sel z17.s, p6, z10.s, z17.s + sel z18.s, p7, z10.s, z18.s + sel z19.s, p8, z10.s, z19.s + + // 1 / 1 + poly + dup z10.s, w15 // z10: 1 + fadd z16.s, z10.s, z16.s // poly + 1 + fadd z17.s, z10.s, z17.s // poly + 1 + fadd z18.s, z10.s, z18.s // poly + 1 + fadd z19.s, z10.s, z19.s // poly + 1 + + fdivr z16.s, p0/m, z16.s, z10.s // z16: 1/(poly+1) + fdivr z17.s, p0/m, z17.s, z10.s // z16: 1/(poly+1) + fdivr z18.s, p0/m, z18.s, z10.s // z16: 1/(poly+1) + fdivr z19.s, p0/m, z19.s, z10.s // z16: 1/(poly+1) + + // Stores 4 consecutive registers to the output + .inst 0xa029c790 // st1w {z16.s-z19.s}, pn9, [x28, x9, LSL #2] + + incw x9, ALL, MUL #4 + b inner_body_start%= +inner_body_end%=: + +inner_leftover_start%=: + // Largely ordinary Sve code to handle taylor series 1/1+e^-x for leftover loop. + whilelo p1.s, x9, %x[length] // While x9<length + b.none inner_leftover_end%= + + ld1w z12.s, p1/z, [x27, x9, LSL #2] // x12: input_data (LOADS POINTERS) + fneg z12.s, p1/m, z12.s + + mov z16.d, z5.d // z16: shift + fcmlt p4.s, p1/z, z12.s, z9.s // p4: underflow = x < min_input + fcmlt p5.s, p1/z, z26.s, z12.s // p5: overflow = x > max_input + + fmla z16.s, p1/m, z12.s, z6.s // z16: z = shift + x * inv_ln2 + fsub z20.s, z16.s, z5.s // z20: n = z - shift + fmla z12.s, p1/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi + fmla z12.s, p1/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo + dup z10.s, #23 // z10: 23 + urshl z16.s, p1/m, z16.s, z10.s // z16: scale = z << 23 (2^n) + fmul z20.s, z12.s, z0.s // z20: p1 = r * c1 + mov z22.d, z1.d // z22: p23 = c2 + fmla z22.s, p1/m, z12.s, z2.s // z22: p23 = c2 + r * c3 + mov z24.d, z3.d // z24: c4 + fmla z24.s, p1/m, z12.s, z4.s // z24: p45 = c4 + r * c5 + fmul z12.s, z12.s, z12.s // z12: r2 = r * r + fmla z22.s, p1/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45 + fmla z20.s, p1/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345 + fmla z16.s, p1/m, z20.s, z16.s // z16: poly = scale + p12345 * scale + dup z10.s, #0 // z10: 0 + sel z16.s, p4, z10.s, z16.s // z16: poly = underflow ? 0 : poly + dup z10.s, w10 + sel z16.s, p5, z10.s, z16.s // z16: poly = overflow ? 
inf : poly + + // 1 / 1+poly + dup z10.s, w15 // z10: 1 + fadd z16.s, z10.s, z16.s // z16: z16 + 1 + fdivr z16.s, p0/m, z16.s, z10.s // z16: 1/(poly+1) + + st1w z16.s, p1, [x28, x9, LSL #2] + + incw x9 // each word + 1 + b inner_leftover_start%= +inner_leftover_end%=: + + // ================================================== + // Outer loop closing + // ================================================== + + add x27, x27, %x[src_stride_1] + add x28, x28, %x[dst_stride_1] + b outer_loop_start%= +outer_loop_end%=: + + .inst 0xd503467f // smstop + )" + : + : [src] "r"(src), [dst] "r"(dst), [shape_1] "r"(shape[1]), [src_stride_1] "r"(src_strides[1]), + [dst_stride_1] "r"(dst_strides[1]), [length] "r"(shape[0]) + : "cc", "memory", // + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p9", // + "x9", "x10", "x11", "x12", "x13", "x14", "x15", // + "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_fp32_logistic(const ITensor *in, ITensor *out, const ActivationLayerInfo &act_info, const Window &window) +{ + ARM_COMPUTE_UNUSED(act_info); + const auto *src_info = in->info(); + const auto *dst_info = out->info(); + + const auto &src_strides = src_info->strides_in_bytes(); + const auto &dst_strides = dst_info->strides_in_bytes(); + + // Iterator calculates pointer offsets and takes into account padding. + Iterator input(in, window); + Iterator output(out, window); + + // NOTE: This kernel uses collapsed 2D shapes. + // The excecution window is expected to be pre-collapsed in kernel configure(...) function. + const uintptr_t k_shape[] = {window.num_iterations(0), window.num_iterations(1)}; + + const uintptr_t k_src_strides[] = {src_strides[0], src_strides[1]}; + const uintptr_t k_dst_strides[] = {dst_strides[0], dst_strides[1]}; + + const auto *k_src = reinterpret_cast<const float *>(input.ptr()); + auto *k_dst = reinterpret_cast<float *>(output.ptr()); + + sme2_f32_logistic_kernel(k_src, k_dst, k_shape, k_src_strides, k_dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/logistic/list.h b/src/cpu/kernels/logistic/list.h new file mode 100644 index 0000000000..7893a61248 --- /dev/null +++ b/src/cpu/kernels/logistic/list.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H +#define ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_LOGISTIC_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) + +#ifdef __aarch64__ +DECLARE_LOGISTIC_KERNEL(sme2_fp32_logistic); +#endif // __aarch64__ + +#undef DECLARE_LOGISTIC_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_LOGISTIC_LIST_H diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp index 344b9df0c8..c73d1def6b 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -53,23 +53,24 @@ void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, fl auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr()); auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr()); - float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f)); + float32x4x2_t sum_vec = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; + float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); for (; x <= (window_end_x - window_step_x); x += window_step_x) { float16x8_t data = vld1q_f16(in_ptr + x); - sum_vec = vaddq_f16(sum_vec, data); float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); + sum_vec.val[0] = vaddq_f32(sum_vec.val[0], dl); + sum_vec.val[1] = vaddq_f32(sum_vec.val[1], dh); sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); } - float32x4_t sum_carry_res = - vpaddq_f32(vcvt_f32_f16(vget_high_f16(sum_vec)), vcvt_f32_f16(vget_low_f16(sum_vec))); - float sum = vaddvq_f32(sum_carry_res); - float sum_sq = vaddvq_f32(sum_sq_vec); + float32x4_t sum_carry_res = vpaddq_f32(sum_vec.val[0], sum_vec.val[1]); + float sum = vaddvq_f32(sum_carry_res); + float sum_sq = vaddvq_f32(sum_sq_vec); // Compute left-over elements for (; x < window_end_x; ++x) diff --git a/src/cpu/kernels/mul/generic/sme2/list.h b/src/cpu/kernels/mul/generic/sme2/list.h new file mode 100644 index 0000000000..f6aecfb91c --- /dev/null +++ b/src/cpu/kernels/mul/generic/sme2/list.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H +#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_MUL_KERNEL(func_name) \ + void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) + +DECLARE_MUL_KERNEL(sme2_q8_signed_mul); + +#undef DECLARE_MUL_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_SME2_LIST_H diff --git a/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp new file mode 100644 index 0000000000..94fbaa1bb2 --- /dev/null +++ b/src/cpu/kernels/mul/generic/sme2/qasymm8_signed.cpp @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// Mul SME kernel +void sme2_q8_signed_mul_kernel( // + const int8_t *src, + const int8_t *weights, + int8_t *dst, + const int16_t offset_a, + const int16_t offset_b, + const int16_t offset_c, + const float multiplier, // = (scale_a * scale_b * mul) / scale_c + const uintptr_t win_shape[4], + const uintptr_t src_strides[4], + const uintptr_t wei_strides[4], + const uintptr_t dst_strides[4]) +{ + struct Args + { + uintptr_t shape1; + uintptr_t shape2; + uintptr_t shape3; + const int8_t *src; + const int8_t *wei; + int8_t *dst; + int multiplier14p18; + int offsetC14p18; + int16_t offsetA; + int16_t offsetB; + } args; + + // Constants used to express values in the 14p18 fixed point format + constexpr int32_t two_pwr18i = 262144; + constexpr float two_pwr18f = 262144.f; + + args.shape1 = win_shape[1]; + args.shape2 = win_shape[2]; + args.shape3 = win_shape[3]; + args.src = src; + args.wei = weights; + args.dst = dst; + args.multiplier14p18 = static_cast<int>(multiplier * two_pwr18f); + args.offsetC14p18 = static_cast<int>(offset_c * two_pwr18i); + // Offsets a/b need to be negated as assembly kernel uses addition instructions where subtraction is needed. 
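For reference, the 14p18 constants above make the whole requantization integral: per output element the kernel effectively evaluates offset_c + (a - offset_a) * (b - offset_b) * multiplier, with both multiplier and offset_c scaled by 2^18 and the 18 fractional bits shifted back out with a saturating rounding shift. A minimal scalar sketch of that arithmetic (the function name is illustrative and the 64-bit accumulator is only for clarity; the kernel itself stays in 32-bit lanes):

// Scalar model of one output element of sme2_q8_signed_mul_kernel (sketch only).
#include <algorithm>
#include <cstdint>

inline int8_t q8_mul_reference(int8_t a, int8_t b,
                               int16_t offset_a, int16_t offset_b, int16_t offset_c,
                               float multiplier) // (scale_a * scale_b * scale) / scale_c
{
    const int32_t prod       = (int32_t(a) - offset_a) * (int32_t(b) - offset_b);
    const int32_t mult_14p18 = static_cast<int32_t>(multiplier * 262144.f);          // multiplier * 2^18
    const int64_t acc        = int64_t(offset_c) * 262144 + int64_t(prod) * mult_14p18;
    const int64_t rounded    = (acc + (int64_t(1) << 17)) >> 18;                      // rounding shift, as sqrshr #18 does
    return static_cast<int8_t>(std::clamp<int64_t>(rounded, -128, 127));              // saturate to int8
}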
+ // Offset C, by contrast, is not negated as it needs to be added rather than subtracted.
+ args.offsetA = offset_a * -1;
+ args.offsetB = offset_b * -1;
+
+ // Precondition:
+ assert(src_strides[0] == sizeof(int8_t));
+ assert(wei_strides[0] == sizeof(int8_t));
+ assert(dst_strides[0] == sizeof(int8_t));
+ __asm__ volatile(
+ R"(
+ .inst 0xd503477f // smstart
+ .inst 0x25207811 // ptrue pn9.b
+ ptrue p0.b
+
+ // ==================================================
+ // 3D loop opening
+ // ==================================================
+
+ // ---------------------------------------------------------------- x8: body_length = (length / step) * step
+ cntb x8, ALL, MUL #2 // x8: step = 2 * vl bytes (two z-regs of int8 per iteration)
+ udiv x9, %x[length], x8 // x9: length / step
+ mul x8, x8, x9 // x8: body_length = step * (length / step)
+
+
+ // ---------------------------------------------------------------- load outer-loop count and src/wei/dst pointers from args
+ ldr x10, [%[args_ptr], %[offset_shape_3]]
+ ldr x11, [%[args_ptr], %[offset_src_ptr]]
+ ldr x12, [%[args_ptr], %[offset_wei_ptr]]
+ ldr x13, [%[args_ptr], %[offset_dst_ptr]]
+
+ // Could potentially be replaced with explicit loads.
+ ld1rh {z1.h}, p0/z, [%[args_ptr], %[offset_A_offset]]
+ ld1rh {z2.h}, p0/z, [%[args_ptr], %[offset_B_offset]]
+ ld1rw {z3.s}, p0/z, [%[args_ptr], %[multiplier_offset]]
+
+loop_3_start%=:
+ // for index_3 in shape_3 downto 1
+ cmp x10, #0
+ b.eq loop_3_end%=
+ sub x10, x10, #1
+
+ ldr x14, [%[args_ptr], %[offset_shape_2]]
+ mov x15, x11
+ mov x16, x12
+ mov x17, x13
+
+loop_2_start%=:
+ // for index_2 in shape_2 downto 1
+ cmp x14, #0
+ b.eq loop_2_end%=
+ sub x14, x14, #1
+
+ ldr x7, [%[args_ptr], %[offset_shape_1]]
+ mov x20, x15
+ mov x21, x16
+ mov x22, x17
+
+loop_1_start%=:
+ // for index_1 in shape_1 downto 1
+ cmp x7, #0
+ b.eq loop_1_end%=
+ sub x7, x7, #1
+
+ mov x9, #0 // x9: index/count
+
+inner_loop_body_start%=:
+ cmp x9, x8
+ b.eq inner_loop_body_end%=
+
+ // Widening load: load 2 z-regs of int8 for each of a/b
+ // (instead of 4, to stay within the register budget).
+ .inst 0xa0090684 // ld1b {z4.b-z5.b}, pn9/z, [x20, x9]
+ .inst 0xa00906a6 // ld1b {z6.b-z7.b}, pn9/z, [x21, x9]
+
+ // Widen to 16 bits
+ .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} // (a)
+ .inst 0xc175e0d0 // sunpk {z16.h-z19.h}, {z6.b-z7.b} // (b)
+
+ // Apply offset to all registers in 16-bit
+ .inst 0xc161ab0c // add {z12.h-z15.h}, {z12.h-z15.h}, z1.h //a
+ .inst 0xc162ab10 // add {z16.h-z19.h}, {z16.h-z19.h}, z2.h //b
+
+ // Widen to 32-bit now. 
+ // 12-19 are taken + // 4-11 a, 20-27 b + .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} //a + .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h} + .inst 0xc1b5e214 // sunpk {z20.s-z23.s}, {z16.h-z17.h} //b + .inst 0xc1b5e258 // sunpk {z24.s-z27.s}, {z18.h-z19.h} + + // Multiply a*b in int32 + // Output in z4-z11 + MUL z4.s, z4.s, z20.s + MUL z5.s, z5.s, z21.s + MUL z6.s, z6.s, z22.s + MUL z7.s, z7.s, z23.s + MUL z8.s, z8.s, z24.s + MUL z9.s, z9.s, z25.s + MUL z10.s, z10.s, z26.s + MUL z11.s, z11.s, z27.s + + // offsets + dup z12.s, %w[offset_C] + dup z13.s, %w[offset_C] + dup z14.s, %w[offset_C] + dup z15.s, %w[offset_C] + dup z16.s, %w[offset_C] + dup z17.s, %w[offset_C] + dup z18.s, %w[offset_C] + dup z19.s, %w[offset_C] + + // MLA Fixed Point multiplication integer + MLA z12.s, p0/m, z4.s, z3.s + MLA z13.s, p0/m, z5.s, z3.s + MLA z14.s, p0/m, z6.s, z3.s + MLA z15.s, p0/m, z7.s, z3.s + MLA z16.s, p0/m, z8.s, z3.s + MLA z17.s, p0/m, z9.s, z3.s + MLA z18.s, p0/m, z10.s, z3.s + MLA z19.s, p0/m, z11.s, z3.s + + // Int32 to Int8 saturate + .inst 0xc16eda05 // sqrshr z5.b, {z16.s-z19.s}, #18 + .inst 0xc16ed984 // sqrshr z4.b, {z12.s-z15.s}, #18 + // Store + .inst 0xa02906c4 // st1b {z4.b-z5.b}, pn9, [x22, x9] + + incb x9, ALL, MUL #2 + b inner_loop_body_start%= +inner_loop_body_end%=: + +inner_loop_leftover_start%=: + whilelo p1.b, x9, %x[length] // While x9<length + b.none inner_loop_leftover_end%= + + // HANDLE MULTIPLICATION HERE + ld1b z4.b, p1/z, [x20, x9] // z4: a input_data + ld1b z5.b, p1/z, [x21, x9] // z5: b input_data + + // Widen register z4 (a) + sunpklo z6.h, z4.b // lower as 16 bits + sunpkhi z7.h, z4.b // upper as 16 bits + + // Widen register z5 (b) + sunpklo z8.h, z5.b // lower as 16 bits + sunpkhi z9.h, z5.b // upper as 16 bits + + // Apply offset in 16bit maths to all resulting vectors. + add z6.h, z6.h, z1.h //a + add z7.h, z7.h, z1.h + add z8.h, z8.h, z2.h //b + add z9.h, z9.h, z2.h + + // Widen a,b to 32-bit z-registers. + // Multiply a and b and store result as 32 bit int. + // a lower - 32-bit + sunpklo z10.s, z6.h + sunpkhi z11.s, z6.h + // a upper - 32-bit + sunpklo z12.s, z7.h + sunpkhi z13.s, z7.h + + // b lower - 32-bit + sunpklo z14.s, z8.h + sunpkhi z15.s, z8.h + // b upper - 32-bit + sunpklo z16.s, z9.h + sunpkhi z17.s, z9.h + + // offsets + dup z4.s, %w[offset_C] + dup z5.s, %w[offset_C] + dup z6.s, %w[offset_C] + dup z7.s, %w[offset_C] + + // Multiply a*b (lower) in int32 + MUL z10.s, z10.s, z14.s + MUL z11.s, z11.s, z15.s + + // Multiply a*b (upper) in int32 + MUL z12.s, z12.s, z16.s + MUL z13.s, z13.s, z17.s + + // Still int32 here. + // Now MLA in fixed point + MLA z4.s, p0/m, z10.s, z3.s + MLA z5.s, p0/m, z11.s, z3.s + MLA z6.s, p0/m, z12.s, z3.s + MLA z7.s, p0/m, z13.s, z3.s + + // Right shift, no narrow + LSR z20.s, z4.s, #8 + LSR z21.s, z5.s, #8 + LSR z22.s, z6.s, #8 + LSR z23.s, z7.s, #8 + + // Right shift rounding (lower) + // Do not saturate. + RSHRNB z20.h, z20.s, #8 + RSHRNB z21.h, z21.s, #8 + UZP1 z25.h, z20.h, z21.h + // Right shift upper. + RSHRNB z22.h, z22.s, #8 + RSHRNB z23.h, z23.s, #8 + UZP1 z26.h, z22.h, z23.h + + // Shift again to 8 bit both vectors. Recombine. 
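+ // Note: this staged narrowing (LSR #8, RSHRNB #8, then SQRSHRNB #2) shifts by
+ // 18 bits in total, mirroring the single sqrshr #18 used in the main loop,
+ // with the rounding applied per stage rather than once.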
+ SQRSHRNB z25.b, z25.h, #2 + SQRSHRNB z26.b, z26.h, #2 + UZP1 z27.b, z25.b, z26.b + + st1b z27.b, p1, [x22, x9] + + incb x9 // x9 : x9 += sizeof(element) * predicate_count + b inner_loop_leftover_start%= +inner_loop_leftover_end%=: + + // ================================================== + // 3D loop closing + // ================================================== + + add x20, x20, %[src_stride_1] + add x21, x21, %[wei_stride_1] + add x22, x22, %[dst_stride_1] + b loop_1_start%= +loop_1_end%=: + + add x15, x15, %[src_stride_2] + add x16, x16, %[wei_stride_2] + add x17, x17, %[dst_stride_2] + b loop_2_start%= +loop_2_end%=: + + add x11, x11, %[src_stride_3] + add x12, x12, %[wei_stride_3] + add x13, x13, %[dst_stride_3] + b loop_3_start%= +loop_3_end%=: + + .inst 0xd503467f // smstop + )" + : + : // The following arguments are loaded via arg ptr values and a constant offset. + [args_ptr] "r"(&args), [offset_src_ptr] "I"(offsetof(Args, src)), [offset_wei_ptr] "I"(offsetof(Args, wei)), + [offset_dst_ptr] "I"(offsetof(Args, dst)), [offset_shape_1] "I"(offsetof(Args, shape1)), + [offset_shape_2] "I"(offsetof(Args, shape2)), [offset_shape_3] "I"(offsetof(Args, shape3)), + [multiplier_offset] "I"(offsetof(Args, multiplier14p18)), // + [offset_A_offset] "I"(offsetof(Args, offsetA)), // + [offset_B_offset] "I"(offsetof(Args, offsetB)), // + // Use registers for efficiency sake. + [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), [src_stride_3] "r"(src_strides[3]), + [wei_stride_1] "r"(wei_strides[1]), [wei_stride_2] "r"(wei_strides[2]), [wei_stride_3] "r"(wei_strides[3]), + [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), [dst_stride_3] "r"(dst_strides[3]), + [offset_C] "r"(args.offsetC14p18), // + [length] "r"(win_shape[0]) + : "cc", "memory", // + "p0", "p1", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_q8_signed_mul(const ITensor *in0, const ITensor *in1, ITensor *out, const Window &window, const float scale) +{ + const auto *src_info = in0->info(); + const auto *src2_info = in1->info(); + const auto *dst_info = out->info(); + + const UniformQuantizationInfo src_q_info = src_info->quantization_info().uniform(); + const UniformQuantizationInfo src2_q_info = src2_info->quantization_info().uniform(); + const UniformQuantizationInfo dst_q_info = dst_info->quantization_info().uniform(); + + const auto &src_strides_bytes = src_info->strides_in_bytes(); + const auto &wei_strides_bytes = src2_info->strides_in_bytes(); + const auto &dst_strides_bytes = dst_info->strides_in_bytes(); + + // NOTE: This kernel does not support shapes above 4D (Unless excecution window has been collapsed) + assert(window.num_iterations(4) == 1 && window.num_iterations(5) == 1); + + // Note : The window is expected to handle y-broadcasting by setting relevant strides to 0. + const uintptr_t shape[] = { + window.num_iterations(0), + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + Window input1_win = window.broadcast_if_dimension_le_one(src_info->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2_info->tensor_shape()); + + // First dim is always datasize. If broadcasting in other dims, set stride to 0. 
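The stride arrays just below put that rule into practice; broadcasting falls out of the fact that a zero stride simply re-reads the same data on every step of that dimension. A minimal sketch of the idea (shapes and names here are illustrative, not ACL code):

// Broadcasting a single row over M rows by giving it a zero row stride (sketch).
#include <cstdint>
#include <cstdio>

int main()
{
    const int8_t matrix[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
    const int8_t bias[4]      = {10, 20, 30, 40};

    const uintptr_t mat_row_stride  = 4 * sizeof(int8_t); // advance one row per iteration
    const uintptr_t bias_row_stride = 0;                   // zero stride: same row every iteration

    const int8_t *mat_ptr  = reinterpret_cast<const int8_t *>(matrix);
    const int8_t *bias_ptr = bias;

    for (int y = 0; y < 2; ++y)
    {
        for (int x = 0; x < 4; ++x)
        {
            std::printf("%d ", mat_ptr[x] + bias_ptr[x]); // the bias row is re-read for every y
        }
        std::printf("\n");
        mat_ptr += mat_row_stride;   // strides are in bytes; valid here because elements are int8_t
        bias_ptr += bias_row_stride; // never advances
    }
    return 0;
}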
+ uintptr_t src_strides[] = {src_strides_bytes[0], (input1_win.is_broadcasted(1)) ? 0 : src_strides_bytes[1], + (input1_win.is_broadcasted(2)) ? 0 : src_strides_bytes[2], + (input1_win.is_broadcasted(3)) ? 0 : src_strides_bytes[3]}; + uintptr_t wei_strides[] = {wei_strides_bytes[0], (input2_win.is_broadcasted(1)) ? 0 : wei_strides_bytes[1], + (input2_win.is_broadcasted(2)) ? 0 : wei_strides_bytes[2], + (input2_win.is_broadcasted(3)) ? 0 : wei_strides_bytes[3]}; + + const uintptr_t dst_strides[] = { + dst_strides_bytes[0], + dst_strides_bytes[1], + dst_strides_bytes[2], + dst_strides_bytes[3], + }; + + const uintptr_t src_offset = window[0].start() * src_strides[0] + window[1].start() * src_strides[1] + + window[2].start() * src_strides[2] + window[3].start() * src_strides[3] + + in0->info()->offset_first_element_in_bytes(); + const uintptr_t src2_offset = window[0].start() * wei_strides[0] + window[1].start() * wei_strides[1] + + window[2].start() * wei_strides[2] + window[3].start() * wei_strides[3] + + in1->info()->offset_first_element_in_bytes(); + const uintptr_t dst_offset = window[0].start() * dst_strides[0] + window[1].start() * dst_strides[1] + + window[2].start() * dst_strides[2] + window[3].start() * dst_strides[3] + + out->info()->offset_first_element_in_bytes(); + + const auto *src = reinterpret_cast<const int8_t *>(in0->buffer() + src_offset); + const auto *src2 = reinterpret_cast<const int8_t *>(in1->buffer() + src2_offset); + auto *dst = reinterpret_cast<int8_t *>(out->buffer() + dst_offset); + + // Calculate or retrieve necessary offsets/scale values. + const int16_t offset_a = src_q_info.offset; + const int16_t offset_b = src2_q_info.offset; + float multiplier = (src_q_info.scale * src2_q_info.scale * scale) / dst_q_info.scale; + + sme2_q8_signed_mul_kernel(src, src2, dst, offset_a, offset_b, dst_q_info.offset, multiplier, shape, src_strides, + wei_strides, dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h index 3f98e71896..0012ff6609 100644 --- a/src/cpu/operators/CpuConv2d.h +++ b/src/cpu/operators/CpuConv2d.h @@ -85,6 +85,7 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8_SIGNED |S32 |QASYMM8 | * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | @@ -93,7 +94,7 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL or QASYMM8_SIGNED if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -139,7 +140,7 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. 
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported:Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * Data type supported:Same as @p src, also could be QSYMM8_PER_CHANNEL or QASYMM8_SIGNED if input is QASYMM8/QASYMM8_SIGNED. * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p src. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index f3b78f8885..4ef5722438 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -227,6 +227,7 @@ CpuGemmConv2d::CpuGemmConv2d() _is_prepared(false), _wt_method(WeightTransformMethod::ReshapeThenTranspose), _run_wt(true), + _act_info(), _aux_mem(AuxTensorIdx::Count) { } @@ -278,6 +279,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); + _act_info = act_info; if (supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); @@ -291,11 +293,14 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL); quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); + const GEMMInfo gemm_info = + GEMMInfo(false /* is_a_reshaped */, false /* is_b_reshaped */, true /* reshape_b_only_on_first_run */, + gemm_3d_depth, _skip_im2col, false /* retain_internal_weights */, output_info, + false /* fp_mixed_precision */, enable_fast_math, false /* broadcast_bias */, act_info, + fixed_format, weight_format, false /* pretranspose_B. TODO: COMPMID-6596 */); + _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, - GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, - enable_fast_math, false, act_info, fixed_format, weight_format, - false /* pretranspose_B. 
TODO: COMPMID-6596 */)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, gemm_info); auto mm_mem_req = _mm_gemmlowp->workspace(); for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) @@ -306,7 +311,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, else { // Create GEMMInfo structure - const GEMMInfo &gemm_info = + const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, @@ -800,6 +805,58 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, return Status{}; } +void CpuGemmConv2d::update_quantization_parameters(ITensorPack &tensors) +{ + // Supported activations in GEMM + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto dst = tensors.get_tensor(ACL_DST); + + auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + TensorInfo tmp_src{*src->info()}; + TensorInfo tmp_weights{*wei->info()}; + + const QuantizationInfo iqinfo = src->info()->quantization_info(); + const QuantizationInfo wqinfo = wei->info()->quantization_info(); + const QuantizationInfo oqinfo = (dst->info()->total_size() == 0) ? iqinfo : dst->info()->quantization_info(); + const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->info()->data_type(); + + tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) + { + const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); + tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); + } + + // Merge activation with output stage + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + + if (supported_acts.count(_act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(_act_info, data_type, uoqinfo); + } + + GEMMLowpOutputStageInfo output_info; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); + + _mm_gemmlowp->update_quantization_parameters(output_info, tmp_src.quantization_info(), + tmp_weights.quantization_info(), _is_prepared, true); +} + void CpuGemmConv2d::run(ITensorPack &tensors) { prepare(tensors); diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h index fa16ce860b..e4e34cc7c5 100644 --- a/src/cpu/operators/CpuGemmConv2d.h +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -76,6 +76,7 @@ public: * |F32 |F32 |F32 |F32 | * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8_SIGNED |S32 |QASYMM8 | * |QASYMM8 
|QSYMM8_PER_CHANNEL |S32 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | @@ -142,6 +143,12 @@ public: const ActivationLayerInfo &act_info = ActivationLayerInfo(), const bool enable_fast_math = false); + /** Update of quantization information at the run stage for convolution so that the quantization multipliers can be properly calculated. + * + * @param[in] tensors Vector that contains the tensors to operate on. + */ + void update_quantization_parameters(ITensorPack &tensors); + // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; @@ -292,6 +299,7 @@ private: bool _is_prepared; WeightTransformMethod _wt_method; bool _run_wt; + ActivationLayerInfo _act_info; experimental::MemoryRequirements _aux_mem{Count}; }; diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index f3396fbb5c..0ea3c249df 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -128,24 +128,31 @@ void CpuGemmLowpMatrixMultiplyCore::configure( _reshape_b_only_on_first_run; _gemm_info = gemm_info; - // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). - // It is not needed if the datatype is symmetric, because there is no offset - bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic(); - bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic(); + const ITensorInfo *a_to_use = a; - _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); + // Initialize assembly kernel meta-data + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const ITensorInfo *a_to_use = a; + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + + // If inputs are mixed-sign but this machine does not support mixed sign kernels, + // flip the sign so matched-sign kernels can be used. + if (!_flip_signedness && a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED && + !bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, dst, asm_info))) + { + _flip_signedness = true; + } + + _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); // Convert to QASYMM8 -> QASYMM8_SIGNED and back if (_flip_signedness) { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - - _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( - QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; @@ -166,6 +173,11 @@ void CpuGemmLowpMatrixMultiplyCore::configure( matrix_a = &_signed_a; } + // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). 
+ // It is not needed if the datatype is symmetric, because there is no offset + bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic(); + bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic(); + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { @@ -173,8 +185,6 @@ void CpuGemmLowpMatrixMultiplyCore::configure( _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); } - // Initialize assembly kernel meta-data - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ if (!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. @@ -375,10 +385,6 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, int32_t a_offset = a->quantization_info().uniform().offset; int32_t b_offset = b->quantization_info().uniform().offset; - // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). - bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic(); - bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic(); - bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; if (fuse_output_stage) { @@ -386,19 +392,31 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(info); + // Convert QASYMM8->QASYMM8_SIGNED - TensorInfo signed_a{}; + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + + TensorInfo signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if (flip_signedness) + + // If inputs are mixed-sign but this machine does not support mixed sign kernels, + // flip the sign so matched-sign kernels can be used. + if (!flip_signedness && a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED && + !bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info))) { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + flip_signedness = true; + } - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( - QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + if (flip_signedness) + { ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); a_to_use = &signed_a; a_offset = signed_a.quantization_info().uniform().offset; @@ -418,8 +436,9 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, matrix_a_info = &signed_a; } - // Initialize assembly kernel meta-data - const AsmGemmInfo asm_info = init_assembly_metadata(info); + // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). 
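The two flags just below gate those kernels. The contribution they account for is the standard quantized-GEMM expansion: with zero points za and zb, sum_k (a_ik - za) * (b_kj - zb) = sum_k a_ik * b_kj - zb * rowsum_i(A) - za * colsum_j(B) + K * za * zb, which is why row/column sums only need to be computed and folded in when an offset is non-zero or may change at runtime. A small numeric check of the identity (purely illustrative):

// Numeric check of the GEMMLowp offset-contribution identity for one output element (sketch).
#include <cassert>
#include <cstdint>

int main()
{
    const int32_t za = 3, zb = -7;         // zero points of A and B
    const int32_t a[4] = {12, -5, 8, 100}; // one row of A
    const int32_t b[4] = {-2, 40, 7, 19};  // one column of B
    const int32_t K = 4;

    int32_t direct = 0, raw = 0, row_sum_a = 0, col_sum_b = 0;
    for (int k = 0; k < K; ++k)
    {
        direct    += (a[k] - za) * (b[k] - zb);
        raw       += a[k] * b[k];
        row_sum_a += a[k];
        col_sum_b += b[k];
    }

    // raw - zb*rowsum(A) - za*colsum(B) + K*za*zb reproduces the zero-point-corrected product.
    assert(direct == raw - zb * row_sum_a - za * col_sum_b + K * za * zb);
    return 0;
}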
+ bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic(); + bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic(); // Check if we need to run the optimized assembly kernel bool run_optimised = false; @@ -556,9 +575,12 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( - output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr, - b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset)); + if (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::QASYMM8_SIGNED) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr, + b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset)); + } } } @@ -614,7 +636,6 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) if (_asm_glue->is_configured()) { ITensorPack asm_glue_tensors = tensors; - auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst); if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { @@ -625,6 +646,7 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { + auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); @@ -775,5 +797,17 @@ experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() cons { return _aux_mem; } + +void CpuGemmLowpMatrixMultiplyCore::update_quantization_parameters(const GEMMLowpOutputStageInfo &output_info, + const QuantizationInfo &a, + const QuantizationInfo &b, + const bool is_prepared, + const bool negated_offsets) +{ + auto lowp_os = output_info; + _gemm_info.set_gemmlowp_output_stage(lowp_os); + _asm_glue->update_quantization_parameters(output_info, a, b, is_prepared, negated_offsets); + _is_prepared = is_prepared; +} } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h index 38121c9bb4..033979e93f 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -81,11 +81,13 @@ public: * |src0 |src1 |src2 |dst | * |:--------------|:------------------|:--------|:--------------| * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8_SIGNED |S32 |QASYMM8 | * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | * |QASYMM8 |QASYMM8 |S32 |S32 | * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8 |QASYMM8_SIGNED |F32 |F32 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | @@ -131,6 +133,11 @@ public: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; + void update_quantization_parameters(const GEMMLowpOutputStageInfo &output_info, + const 
QuantizationInfo &a, + const QuantizationInfo &b, + const bool is_prepared, + const bool negated_offsets); private: enum AuxTensorIdx diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index f68ae9883f..acc620edc6 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -215,6 +215,8 @@ void CpuMatMul::configure(ITensorInfo *lhs, // Setup transpose LHS _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); + + _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size()); } if (_adj_rhs) @@ -222,6 +224,8 @@ void CpuMatMul::configure(ITensorInfo *lhs, // Setup transpose RHS _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed); + + _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size()); } // 3. Configure assembly kernel using transposed tensors. @@ -269,9 +273,6 @@ void CpuMatMul::configure(ITensorInfo *lhs, _aux_mem[idx] = aux; idx++; } - // Memory requirements for transposed tensors - _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size()); - _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size()); } void CpuMatMul::run(ITensorPack &tensors) diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp index 25acc92d00..2d4e009d51 100644 --- a/src/cpu/operators/CpuPermute.cpp +++ b/src/cpu/operators/CpuPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,23 +23,91 @@ */ #include "src/cpu/operators/CpuPermute.h" +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorInfo.h" + #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" #include "src/cpu/kernels/CpuPermuteKernel.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" + +#include <algorithm> +#include <array> +#include <memory> namespace arm_compute { namespace cpu { +namespace +{ +// Handle "No-op" cases +bool prefer_copy(const PermutationVector &v) +{ + static const std::array<PermutationVector, 6> permutations = {{ + PermutationVector(0U), + PermutationVector(0U, 1U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 1U, 2U, 3U), + PermutationVector(0U, 1U, 2U, 3U, 4U), + PermutationVector(0U, 1U, 2U, 3U, 4U, 5U), + }}; + + return std::find(permutations.begin(), permutations.end(), v) != permutations.end(); +} + +// Transpose kernel is optimized for permuting the first two dimensions of a tensor +bool prefer_transpose(const PermutationVector &v) +{ + static const std::array<PermutationVector, 5> permutations = {{ + PermutationVector(1U, 0U), + PermutationVector(1U, 0U, 2U), + PermutationVector(1U, 0U, 2U, 3U), + PermutationVector(1U, 0U, 2U, 3U, 4U), + PermutationVector(1U, 0U, 2U, 3U, 4U, 5U), + }}; + + return std::find(permutations.begin(), permutations.end(), v) != permutations.end(); +} +} // namespace + void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); - auto k = std::make_unique<kernels::CpuPermuteKernel>(); - k->configure(src, dst, perm); - _kernel = std::move(k); + + if (prefer_copy(perm)) + { + auto k = std::make_unique<kernels::CpuCopyKernel>(); + k->configure(src, dst); + _kernel = std::move(k); + } + else if (prefer_transpose(perm)) + { + auto k = std::make_unique<kernels::CpuTransposeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); + } + else + { + auto k = std::make_unique<kernels::CpuPermuteKernel>(); + k->configure(src, dst, perm); + _kernel = std::move(k); + } } Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) { + if (prefer_copy(perm)) + { + return kernels::CpuCopyKernel::validate(src, dst); + } + + if (prefer_transpose(perm)) + { + return kernels::CpuTransposeKernel::validate(src, dst); + } + return kernels::CpuPermuteKernel::validate(src, dst, perm); } } // namespace cpu diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp index a423abb49a..fe4d0f9d7d 100644 --- a/src/cpu/operators/CpuReshape.cpp +++ b/src/cpu/operators/CpuReshape.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023 Arm Limited. + * Copyright (c) 2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -53,8 +53,8 @@ void CpuReshape::run(ITensorPack &tensors) static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors); _is_prepared = true; } - const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension(); - NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); + + ICpuOperator::run(tensors); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuScatter.cpp b/src/cpu/operators/CpuScatter.cpp new file mode 100644 index 0000000000..f82413d201 --- /dev/null +++ b/src/cpu/operators/CpuScatter.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuScatter.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/kernels/CpuScatterKernel.h" + +namespace arm_compute +{ +namespace cpu +{ + +void CpuScatter::configure(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &Scatter_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, indices, dst); + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_UNUSED(updates); + ARM_COMPUTE_UNUSED(indices); + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_UNUSED(Scatter_info); +} + +Status CpuScatter::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &Scatter_info) +{ + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_UNUSED(updates); + ARM_COMPUTE_UNUSED(indices); + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_UNUSED(Scatter_info); + + return Status{ErrorCode::RUNTIME_ERROR, "No configuration implemented yet."}; +} + +void CpuScatter::run(ITensorPack &tensors) +{ + ARM_COMPUTE_UNUSED(tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuScatter.h b/src/cpu/operators/CpuScatter.h new file mode 100644 index 0000000000..d0161a778d --- /dev/null +++ b/src/cpu/operators/CpuScatter.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUSCATTER_H +#define ACL_SRC_CPU_OPERATORS_CPUSCATTER_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ScatterInfo.h" + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to execute Scatter in Neon ™ */ +class CpuScatter : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * @note indices must always be U32 + * @note src, updates and dst tensors must be same datatype. + * + * @param[in] src Source input tensor info. Can be nullptr when using "Add" Scatter Function with zero initialization. + * @param[in] updates Tensor info for tensor storing update values to use for scatter function. Data types supported: same as @p src. + * @param[in] indices Tensor info for tensor storing indices to use for scatter function. Data types supported: U32 only. + * @param[out] dst Output tensor to store the result of the Scatter Function. Data types supported: same as @p src and @p updates. + * @param[in] Scatter_info Contains Scatter operation information described in @ref ScatterInfo. + */ + void configure(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &Scatter_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuScatter::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &Scatter_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<ICPPKernel> _scatter_kernel{nullptr}; + std::unique_ptr<ICPPKernel> _fill_kernel{nullptr}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUSCATTER_H diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index fb9bc15212..881142c374 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -45,6 +45,7 @@ namespace /** Run pretranspose_B_array in parallel (1D static scheduling) * * @tparam TypeInput + * @tparam TypeWeight * @tparam TypeOutput * * @param[in] gemm_asm GemmCommon kernel to run @@ -54,14 +55,14 @@ namespace * @param[in] src_multi_stride Stride in z ("multi") * @param[in] num_threads Number of threads to run this method. 
Must be >= 1 */ -template <typename TypeInput, typename TypeOutput> -void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, - ITensor *dst, - const TypeInput *src, - int src_ld, - int src_multi_stride, - unsigned int num_threads, - bool transpose) +template <typename TypeInput, typename TypeWeight, typename TypeOutput> +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput> *gemm_asm, + ITensor *dst, + const TypeWeight *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads, + bool transpose) { ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); ARM_COMPUTE_ERROR_ON(num_threads == 0); @@ -91,14 +92,6 @@ using namespace arm_compute::experimental; namespace { -struct free_delete -{ - void operator()(void *x) - { - free(x); - } -}; - struct Params { unsigned int M; @@ -113,14 +106,13 @@ struct Params Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - Params p; - p.M = d->tensor_shape().y(); - p.K = a->tensor_shape().x(); - p.N = d->tensor_shape().x(); - p.batches = 1; - p.multis = 1; - p.sections = 1; - p.indirect = false; + Params p{/* M */ static_cast<unsigned int>(d->tensor_shape().y()), + /* N */ static_cast<unsigned int>(d->tensor_shape().x()), + /* K */ static_cast<unsigned int>(a->tensor_shape().x()), + /* batches */ 1, + /* multis */ 1, + /* sections */ 1, + /* indirect */ false}; if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { @@ -172,13 +164,10 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp } /** Fallback in case ACL doesn't have a function */ -template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing> +template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage = arm_gemm::Nothing> class Fallback : public CpuGemmAssemblyDispatch::IFallback { public: - /** Destructor */ - ~Fallback() = default; - /** Initialise the functions's input and output. * * @param[in] a Input tensor containing the Matrix A. @@ -222,12 +211,45 @@ public: bool isVarWeightsKernel() const override { if (!_gemm_kernel_asm) + { return false; + } const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; } + void update_quantization_parameters(const GEMMLowpOutputStageInfo &output_info, + const QuantizationInfo &a, + const QuantizationInfo &b, + const bool is_prepared, + const bool negated_offsets) override + { + const int32_t negation = negated_offsets ? 1 : -1; + const int32_t a_offset = -a.uniform().offset * negation; + const int32_t b_offset = -b.uniform().offset * negation; + + arm_gemm::Requantize32 gemm_requant_info{}; + if (output_info.gemmlowp_shifts.size() > 1) + { + const auto requantize_data = + this->set_requantize_data(output_info.gemmlowp_multipliers, output_info.gemmlowp_shifts); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, output_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? 
                                                       std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+                                                      std::get<3>(requantize_data), output_info.gemmlowp_min_bound, output_info.gemmlowp_max_bound);
+        }
+        else
+        {
+            gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, output_info.gemmlowp_offset,
+                                                       -output_info.gemmlowp_shift, output_info.gemmlowp_multiplier,
+                                                       output_info.gemmlowp_min_bound, output_info.gemmlowp_max_bound);
+        }
+
+        _gemm_kernel_asm->update_quantization_parameters(gemm_requant_info);
+        _is_prepared = is_prepared;
+    }
+
 private:
     enum AuxTensorIdx
     {
@@ -251,7 +273,7 @@ private:
     /** Operator to transpose B before gemm or pretranspose_B_array*/
     std::unique_ptr<CpuTranspose> _pre_pretranspose_b{nullptr};
     /** Assembly Gemm kernel */
-    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
+    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeWeight, TypeOutput>> _gemm_kernel_asm{nullptr};
     /** Optimised Arm® Neon™ kernel */
     std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
     /** Assembly GEMM workspace tensor info */
@@ -273,22 +295,22 @@ private:
     /** Per channel quantization multipliers */
     std::vector<int32_t> _multipliers{};
     /** Indirect buffer */
-    std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
-    std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
-    std::vector<TypeInput> _indirect_pad{};
-    arm_gemm::ConvolutionParameters _cp{};
-    experimental::MemoryRequirements _aux_mem{Count};
-    bool _B_pretranspose_required{false};
-    bool _is_b_constant{true};
-    bool _is_c_constant{true};
-    bool _run_pre_pretranspose_b{false};
-    bool _B_pre_pretranspose_required{false};
+    std::vector<const TypeInput *const *> _indirect_arg{};
+    std::vector<const TypeInput *> _indirect_buf{};
+    std::vector<TypeInput> _indirect_pad{};
+    arm_gemm::ConvolutionParameters _cp{};
+    experimental::MemoryRequirements _aux_mem{Count};
+    bool _B_pretranspose_required{false};
+    bool _is_b_constant{true};
+    bool _is_c_constant{true};
+    bool _run_pre_pretranspose_b{false};
+    bool _B_pre_pretranspose_required{false};
 };

-template <typename TypeInput, typename TypeOutput, class OutputStage>
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
-                                                                  const std::vector<int32_t> &multipliers)
+Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+                                                                              const std::vector<int32_t> &multipliers)
 {
     _multipliers = multipliers;
     _shifts = shifts;
@@ -305,8 +327,8 @@ Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vec
     return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
 {
     auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
@@ -343,14 +365,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
                         if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
                         {
-                            _indirect_buf
-                                .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+                            _indirect_buf[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                                 _indirect_pad.data();
                         }
                         else
                         {
-                            _indirect_buf
-                                .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+                            _indirect_buf[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                                 A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
                         }
                     }
@@ -361,11 +381,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
         }
     }
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
-                                                                      const ITensorInfo *b,
-                                                                      const ITensorInfo *d,
-                                                                      const AsmGemmInfo &info)
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+                                                                                  const ITensorInfo *b,
+                                                                                  const ITensorInfo *d,
+                                                                                  const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
@@ -375,13 +395,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
         zeropad = a->quantization_info().uniform().offset;
     }

-    const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
-    const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
-    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
-    const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
-    const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
-    const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
-    const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
+    const auto input_width = static_cast<int64_t>(a->tensor_shape()[1]);
+    const auto input_height = static_cast<int64_t>(a->tensor_shape()[2]);
+    const auto input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+    const auto kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
+    const auto kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
+    const auto output_width = static_cast<int64_t>(d->tensor_shape()[1]);
+    const auto output_height = static_cast<int64_t>(d->tensor_shape()[2]);

     _cp = {input_width,
            input_height,
@@ -392,6 +412,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
            output_height,
            info.ps_info.stride().first,
            info.ps_info.stride().second,
+           1,
+           1,
            info.padding_top,
            info.padding_left,
            zeropad};
@@ -414,10 +436,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
         const int multi_size = batch_size * batches;
         const size_t multi_stride = multi_size / sizeof(TypeInputPtr);

-        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
-            reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
-        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
-            reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+        _indirect_buf = std::vector<const TypeInput *>(multi_size * multis);
+        _indirect_arg = std::vector<const TypeInput *const *>(sizeof(TypeInput **) * kernel_hw * multis * batches);
         _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));

         // Set indirect argument
@@ -428,29 +448,28 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
             {
                 for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
                 {
-                    (_indirect_arg.get())[pos++] =
-                        _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+                    _indirect_arg[pos++] = &_indirect_buf[m * multi_stride + b * batch_stride + kernel_xy * output_hw];
                 }
             }
         }

-        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.data());
     }
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
-                                                             const ITensorInfo *b,
-                                                             const ITensorInfo *c,
-                                                             ITensorInfo *d,
-                                                             arm_gemm::GemmArgs args,
-                                                             const AsmGemmInfo &gemm_info,
-                                                             const OutputStage &os)
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+                                                                         const ITensorInfo *b,
+                                                                         const ITensorInfo *c,
+                                                                         ITensorInfo *d,
+                                                                         arm_gemm::GemmArgs args,
+                                                                         const AsmGemmInfo &gemm_info,
+                                                                         const OutputStage &os)
 {
     _is_b_constant = b->are_values_constant();
     _is_c_constant = c ? c->are_values_constant() : true;

-    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
+    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeWeight, TypeOutput, OutputStage>(args, os);
     if (_gemm_kernel_asm == nullptr)
     {
         //configuration not supported: Leave function unconfigured:
@@ -460,7 +479,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
     arm_gemm::GemmConfig gemm_cfg = _gemm_kernel_asm->get_config();

     // arm_compute wrapper for the Gemm object (see above)
-    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeWeight, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
     acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
     const size_t workspace_size = _gemm_kernel_asm->get_working_size();
@@ -549,8 +568,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
     }
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
 {
     if (!_is_prepared)
     {
@@ -588,17 +607,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
             // Fixed format kernels need no pretranspose.
             ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
                 assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
-            const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
-            const auto in1_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
-                                                                     b_to_use->info()->offset_first_element_in_bytes());
-            const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+            const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+            const auto in1_ptr = reinterpret_cast<const TypeWeight *>(
+                b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes());
+            const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();

             CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
             ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);

             const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
-            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(
+            run_parallel_pretranspose_B_array<TypeInput, TypeWeight, TypeOutput>(
                 _gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b,
                 NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose);
@@ -616,20 +635,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
     }
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+bool Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::is_configured() const
 {
     return _optimised_kernel != nullptr;
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+experimental::MemoryRequirements Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::workspace() const
 {
     return _aux_mem;
 }

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
+template <typename TypeInput, typename TypeWeight, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeWeight, TypeOutput, OutputStage>::run(ITensorPack &tensors)
 {
     auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -663,8 +682,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
     const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();

     auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
-    const TypeInput *in1_ptr = nullptr;
-    auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
+    const TypeWeight *in1_ptr = nullptr;
+    auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());

     const ITensor *b_to_use = b;

@@ -686,8 +705,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
     {
         ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
         multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
-        in1_ptr =
-            reinterpret_cast<const TypeInput *>(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes());
+        in1_ptr = reinterpret_cast<const TypeWeight *>(b_to_use->buffer() +
+                                                       b_to_use->info()->offset_first_element_in_bytes());
     }

     // If necessary, run pretranspose every time if either weights or biases are non-constant
@@ -706,8 +725,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
             ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
                 assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
             const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
-            const auto b_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
-                                                                   b_to_use->info()->offset_first_element_in_bytes());
+            const auto b_ptr = reinterpret_cast<const TypeWeight *>(b_to_use->buffer() +
+                                                                    b_to_use->info()->offset_first_element_in_bytes());
             const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();

             CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
@@ -720,7 +739,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
             else
             {
                 const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
-                run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(
+                run_parallel_pretranspose_B_array<TypeInput, TypeWeight, TypeOutput>(
                     _gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b,
                     NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose);
             }
@@ -744,7 +763,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
     if (split_dim != IScheduler::split_dimensions_all)
     {
         // Make sure the kernel does not expect more threads than we can actually spawn
-        const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
+        const unsigned int num_iterations = _optimised_kernel->window().num_iterations(split_dim);
         num_threads = std::min(num_iterations, num_threads);
     }
     _gemm_kernel_asm->set_nthreads(num_threads);
@@ -775,7 +794,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }

-template <typename TypeInput, typename TypeOutput>
+template <typename TypeInput, typename TypeWeight, typename TypeOutput>
 void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                      const ITensorInfo *a,
                      const ITensorInfo *b,
@@ -794,12 +813,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
                             info.fixed_format, info.fast_mode, info.accumulate, &cfg);

     // Create arm_gemm fallback
-    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
+    auto fallback = std::make_unique<Fallback<TypeInput, TypeWeight, TypeOutput>>();
     fallback->configure(a, b, c, d, args, info);
     arm_gemm = std::move(fallback);
 }

-template <typename TypeInput, typename TypeOutput>
+template <typename TypeInput, typename TypeWeight, typename TypeOutput>
 void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                              const ITensorInfo *a,
                              const ITensorInfo *b,
@@ -820,7 +839,7 @@ void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback>
                             info.fixed_format, info.fast_mode, info.accumulate, &cfg);

     // Create arm_gemm fallback
-    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>();
+    auto fallback = std::make_unique<Fallback<TypeInput, TypeWeight, TypeOutput, arm_gemm::DequantizeFloat>>();

     // Configure requantization info
     const GEMMLowpOutputStageInfo os_info = info.output_stage;
@@ -832,7 +851,7 @@ void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback>
     arm_gemm = std::move(fallback);
 }

-template <typename TypeInput, typename TypeOutput>
+template <typename TypeInput, typename TypeWeight, typename TypeOutput>
 void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                            const ITensorInfo *a,
                            const ITensorInfo *b,
@@ -852,7 +871,7 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
                             info.fixed_format, info.fast_mode, info.accumulate, &cfg);

     // Create arm_gemm fallback
-    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
+    auto fallback = std::make_unique<Fallback<TypeInput, TypeWeight, TypeOutput, arm_gemm::Requantize32>>();

     // Configure requantization info
     const int32_t negation = info.negated_offsets ? 1 : -1;
@@ -905,12 +924,12 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
     arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
                             info.fixed_format, info.fast_mode, info.accumulate, &cfg);
-    // TODO: Incorporate info.transpose_b COMPMID-6595
+    // TODO(COMPMID-6595): Incorporate info.transpose_b
     switch (a->data_type())
     {
         case DataType::F32:
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                !(arm_gemm::has_opt_gemm<float, float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
                 "We could not find an optimized kernel for F32 input");
             break;
 #ifdef __aarch64__
@@ -919,13 +938,22 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
             if (d->data_type() == DataType::S32)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args,
+                                                                                            {})),
                     "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
             }
+            else if (b->data_type() == DataType::QASYMM8_SIGNED)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<uint8_t, int8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf,
+                                                                                               args, {})),
+                    "We could not find an optimized kernel for U8 input with S8 weights and U8 output");
+            }
             else
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf,
+                                                                                                args, {})),
                     "We could not find an optimized kernel for U8 input and U8 output");
             }
             break;
@@ -934,13 +962,15 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
             if (d->data_type() == DataType::S32)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args,
+                                                                                         {})),
                     "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
             }
             else
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args,
+                                                                                             {})),
                     "We could not find an optimized kernel for S8 input and S8 output");
             }
             break;
@@ -952,13 +982,15 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
             if (d->data_type() == DataType::BFLOAT16)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, bfloat16, arm_gemm::Nothing>(arm_gemm_expected_wf,
+                                                                                              args, {})),
                     "We could not find an optimized kernel for BFLOAT16 input and BFLOAT16 output");
             }
             else
             {
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                    !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args,
+                                                                                           {})),
                     "We could not find an optimized kernel for BFLOAT16 input and F32 output");
             }
             break;
@@ -968,7 +1000,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
 #if defined(ENABLE_FP16_KERNELS)
         case DataType::F16:
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-                !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                !(arm_gemm::has_opt_gemm<float16_t, float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args,
+                                                                                             {})),
                 "We could not find an optimized kernel for F16 input and F16 output");
             break;
 #endif /* ENABLE_FP16_KERNELS */
@@ -1009,7 +1042,7 @@ Status CpuGemmAssemblyDispatch::validate(
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
     }
-    else
+    else if (!(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     }
@@ -1024,12 +1057,13 @@ Status CpuGemmAssemblyDispatch::validate(
                                     "Only U32 output supported for U8 input");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
                                     "Only S32 output supported for S8 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
-                                        (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
-                                    "Only QASYMM8/S32 output supported for QASYMM8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        a->data_type() == DataType::QASYMM8 &&
+            (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32 && d->data_type() != DataType::F32),
+        "Only QASYMM8/S32/F32 output supported for QASYMM8 input");
     arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
     const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
-    if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+    if (bool(ret) && expected_weight_format != arm_compute::WeightFormat::ANY)
     {
         // Correctness check: if the format expected by the kernel is
         // not "any", make sure that the one found matches the format
@@ -1062,33 +1096,44 @@ void CpuGemmAssemblyDispatch::configure(
     switch (a->data_type())
     {
         case DataType::F32:
-            create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
+            create_arm_gemm<float, float, float>(_arm_gemm, a, b, c, d, act, info);
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
-            if (d->data_type() == DataType::S32)
+            if (b->data_type() == DataType::S8 || b->data_type() == DataType::QASYMM8_SIGNED)
             {
-                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
+                if (d->data_type() == DataType::F32)
+                {
+                    create_arm_gemm_dequant<uint8_t, int8_t, float>(_arm_gemm, a, b, c, d, act, info);
+                }
+                else
+                {
+                    create_arm_gemm_quant<uint8_t, int8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
+                }
+            }
+            else if (d->data_type() == DataType::S32)
+            {
+                create_arm_gemm<uint8_t, uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
             }
             else
             {
-                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm_quant<uint8_t, uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
             }
             break;
         case DataType::S8:
         case DataType::QASYMM8_SIGNED:
             if (d->data_type() == DataType::S32)
             {
-                create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm<int8_t, int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
             }
             else if (d->data_type() == DataType::F32)
            {
-                create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm_dequant<int8_t, int8_t, float>(_arm_gemm, a, b, c, d, act, info);
             }
             else
             {
-                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm_quant<int8_t, int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
             }
             break;
 #endif /* __aarch64__ */
@@ -1096,17 +1141,17 @@ void CpuGemmAssemblyDispatch::configure(
         case DataType::BFLOAT16:
             if (d->data_type() == DataType::BFLOAT16)
             {
-                create_arm_gemm<bfloat16, bfloat16>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm<bfloat16, bfloat16, bfloat16>(_arm_gemm, a, b, c, d, act, info);
             }
             else
             {
-                create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
+                create_arm_gemm<bfloat16, bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
             }
             break;
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 #ifdef ENABLE_FP16_KERNELS
         case DataType::F16:
-            create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
+            create_arm_gemm<float16_t, float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
             break;
 #endif /* ENABLE_FP16_KERNELS */
         default:
@@ -1136,5 +1181,15 @@ experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
     ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
     return _arm_gemm->workspace();
 }
+
+void CpuGemmAssemblyDispatch::update_quantization_parameters(const GEMMLowpOutputStageInfo &output_info,
+                                                             const QuantizationInfo &a,
+                                                             const QuantizationInfo &b,
+                                                             const bool is_prepared,
+                                                             const bool negated_offsets)
+{
+    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+    _arm_gemm->update_quantization_parameters(output_info, a, b, is_prepared, negated_offsets);
+}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 44c5c189a5..0b6f22d45a 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -28,6 +28,7 @@
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"

 namespace arm_compute
 {
@@ -81,12 +82,17 @@ public:
     class IFallback
     {
     public:
-        virtual void run(ITensorPack &tensors) = 0;
-        virtual void prepare(ITensorPack &tensors) = 0;
-        virtual experimental::MemoryRequirements workspace() const = 0;
-        virtual bool is_configured() const = 0;
-        virtual bool isVarWeightsKernel() const = 0;
-        virtual ~IFallback() = default;
+        virtual void run(ITensorPack &tensors) = 0;
+        virtual void prepare(ITensorPack &tensors) = 0;
+        virtual experimental::MemoryRequirements workspace() const = 0;
+        virtual bool is_configured() const = 0;
+        virtual bool isVarWeightsKernel() const = 0;
+        virtual void update_quantization_parameters(const GEMMLowpOutputStageInfo &,
+                                                    const QuantizationInfo &,
+                                                    const QuantizationInfo &,
+                                                    const bool,
+                                                    const bool) = 0;
+        virtual ~IFallback() = default;
     };

 public:
@@ -185,6 +191,12 @@ public:
         return _arm_gemm && _arm_gemm->isVarWeightsKernel();
     }

+    void update_quantization_parameters(const GEMMLowpOutputStageInfo &output_info,
+                                        const QuantizationInfo &a,
+                                        const QuantizationInfo &b,
+                                        const bool is_prepared,
+                                        const bool negated_offsets);
+
     // Inherited methods overridden:
     void prepare(ITensorPack &tensors) override;
     void run(ITensorPack &tensors) override;
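The hunks above expose a new update_quantization_parameters() entry point on CpuGemmAssemblyDispatch (and on its IFallback) so that requantization data can be refreshed on an already-configured assembly GEMM, e.g. when quantization scales/offsets only become known at run time. The sketch below shows a hypothetical caller; only the method signature and the GEMMLowpOutputStageInfo/QuantizationInfo types come from the library, while the wrapper function, the output-stage values and the flag choices are illustrative assumptions.

// Hypothetical caller of the new entry point; concrete values are placeholders.
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

using namespace arm_compute;

void refresh_runtime_quantization(cpu::CpuGemmAssemblyDispatch &dispatch,
                                  const QuantizationInfo &src_qinfo,
                                  const QuantizationInfo &wei_qinfo)
{
    // Rebuild the requantization description for the new scales/offsets.
    GEMMLowpOutputStageInfo output_info{};
    output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    output_info.gemmlowp_offset     = 0;       // placeholder destination offset
    output_info.gemmlowp_multiplier = 1 << 30; // placeholder fixed-point multiplier
    output_info.gemmlowp_shift      = 1;       // placeholder shift
    output_info.gemmlowp_min_bound  = -128;
    output_info.gemmlowp_max_bound  = 127;

    // is_prepared = true keeps the already-pretransposed weights; negated_offsets
    // mirrors the offset-negation convention used when the dispatch was configured.
    dispatch.update_quantization_parameters(output_info, src_qinfo, wei_qinfo,
                                            /* is_prepared */ true,
                                            /* negated_offsets */ true);
}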
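On the ownership change in Fallback: _indirect_buf and _indirect_arg move from malloc'd blocks held in std::unique_ptr with a custom free_delete deleter to std::vector, whose data() is what gets handed to set_indirect_parameters(). A standalone before/after sketch of that pattern follows; it is generic C++ for illustration only, not library code.

// Illustration of replacing malloc + unique_ptr<.., free_delete> with std::vector.
#include <cstdlib>
#include <memory>
#include <vector>

struct free_delete
{
    void operator()(void *p) const { std::free(p); } // deleter used by the old code path
};

int main()
{
    constexpr std::size_t n = 64;

    // Before: manual allocation, uninitialised storage, pointer arithmetic on .get().
    std::unique_ptr<const float *, free_delete> old_buf(
        reinterpret_cast<const float **>(std::malloc(n * sizeof(const float *))));

    // After: RAII container, elements value-initialised to nullptr, .data() where the
    // assembly interface still expects a raw pointer.
    std::vector<const float *> new_buf(n);
    const float *const *arg = new_buf.data();

    (void)old_buf;
    (void)arg;
    return 0;
}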