aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/scale/neon/fp16.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/scale/neon/fp16.cpp')
-rw-r--r--src/cpu/kernels/scale/neon/fp16.cpp189
1 files changed, 118 insertions, 71 deletions
diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp
index 895f42215e..bd01569cc4 100644
--- a/src/cpu/kernels/scale/neon/fp16.cpp
+++ b/src/cpu/kernels/scale/neon/fp16.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -41,8 +42,12 @@ namespace arm_compute
{
namespace
{
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -62,33 +67,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
Iterator out(dst, window);
const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -103,68 +121,97 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(src, win_in);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type;
const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const float16_t *in_ptr =
+ reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
else
{
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -172,4 +219,4 @@ void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, c
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */