From 53929b1fd4dd3c27f5afb5b8626e27605ebe62cf Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Thu, 11 Aug 2022 12:15:39 +0100 Subject: =?UTF-8?q?Use=20Neon=E2=84=A2=20kernels=20for=20FP=20Bilinear=20R?= =?UTF-8?q?esize=20for=20SVE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes FP Bilinear SVE kernels and uses Neon™ kernels instead Resolves: COMPMID-5449 Signed-off-by: Gunes Bayir Change-Id: I8e01de44bd884cb6578ca0b9358509b69bc31ca2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8100 Benchmark: Arm Jenkins Reviewed-by: Viet-Hoa Do Reviewed-by: Pablo Marquez Tello Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/cpu/kernels/CpuKernelSelectionTypes.h | 8 ++++ src/cpu/kernels/CpuScaleKernel.cpp | 34 +++++++------ src/cpu/kernels/CpuScaleKernel.h | 6 +-- src/cpu/kernels/scale/sve/fp16.cpp | 80 ++----------------------------- src/cpu/kernels/scale/sve/fp32.cpp | 78 ++---------------------------- 5 files changed, 41 insertions(+), 165 deletions(-) (limited to 'src/cpu') diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 19c41f9fcd..e3ecc4e709 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -90,6 +90,13 @@ struct CpuAddKernelDataTypeISASelectorData bool can_interpret_inputs_as_1d_array; }; +struct ScaleKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + InterpolationPolicy interpolation_policy; +}; + // Selector pointer types using DataTypeISASelectorPtr = std::add_pointer::type; using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; @@ -99,6 +106,7 @@ using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer::type; using ActivationDataTypeISASelectorDataPtr = std::add_pointer::type; using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer::type; +using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer::type; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp index e230dfa938..c9e858fc02 100644 --- a/src/cpu/kernels/CpuScaleKernel.cpp +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -52,62 +52,68 @@ static const std::vector available_kernels = { { "sve_fp16_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16; }, + [](const ScaleKernelDataTypeISASelectorData & data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) }, { "sve_fp32_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve; }, + [](const ScaleKernelDataTypeISASelectorData & data) + { + return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) }, { "sve_qu8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve; }, REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) }, { "sve_qs8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve; }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) }, { "sve_u8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::U8 && data.isa.sve; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8 && data.isa.sve; }, REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) }, { "sve_s16_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::S16 && data.isa.sve; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16 && data.isa.sve; }, REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) }, { "neon_fp16_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) }, { "neon_fp32_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) }, { "neon_qu8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) }, { "neon_qs8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) }, { "neon_u8_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::U8; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; }, REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale) }, { "neon_s16_scale", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::S16; }, + [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale) }, }; @@ -115,7 +121,7 @@ static const std::vector available_kernels = Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) { - const auto *uk = CpuScaleKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -174,7 +180,7 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co dst, info)); - const auto *uk = CpuScaleKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h index e0e9e387bd..416e115796 100644 --- a/src/cpu/kernels/CpuScaleKernel.h +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -75,9 +75,9 @@ public: struct ScaleKernel { - const char *name; - const DataTypeISASelectorPtr is_selected; - ScaleKernelPtr ukernel; + const char *name; + const ScaleKernelDataTypeISASelectorDataPtr is_selected; + ScaleKernelPtr ukernel; }; static const std::vector &get_available_kernels(); diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp index d08bfd8cdf..ceda19f366 100644 --- a/src/cpu/kernels/scale/sve/fp16.cpp +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -84,77 +84,6 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off }, out); } - -void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - using ConstType = typename std::conditional::value, half, float16_t>::type; - - const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} } namespace cpu { @@ -162,13 +91,14 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) { - fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else { - fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + ARM_COMPUTE_ERROR("Not implemented"); } } } // namespace cpu diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp index 98b343870f..f3472f1efd 100644 --- a/src/cpu/kernels/scale/sve/fp32.cpp +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -83,75 +83,6 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off }, out); } - -void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const float const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} } namespace cpu { @@ -159,13 +90,14 @@ void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) { - fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else { - fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + ARM_COMPUTE_ERROR("Not implemented"); } } } // namespace cpu -- cgit v1.2.1