aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-10-01 17:48:02 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-10-06 10:40:49 +0000
commit8b8405aec8af17b04205b60094680751abfdc94a (patch)
treeae8f7e68271acf4791cab04c6deaf9a9ddcb3b38
parent8229e5a8b5e38fcfa53cab5752f12e8f17a65e6c (diff)
downloadComputeLibrary-8b8405aec8af17b04205b60094680751abfdc94a.tar.gz
Optimize CpuScale NHWC F32/F16
- Rework CpuScaleKernel F32/F16 NHWC - bilinear
- Rework CpuScaleKernel F32/F16 NHWC - nearest
- Add test to validate the vector computation path

Resolves COMPMID-4801, COMPMID-4802

Change-Id: Ie6e4f262a8cce509edd7b8f564c940758625c58a
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6361
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
-rw-r--r--src/cpu/kernels/CpuScaleKernel.cpp4
-rw-r--r--src/cpu/kernels/scale/neon/list.h377
-rw-r--r--tests/datasets/ScaleValidationDataset.h10
-rw-r--r--tests/validation/NEON/Scale.cpp68
4 files changed, 376 insertions, 83 deletions
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
index 1108c7a78e..3063d8f682 100644
--- a/src/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -123,12 +123,12 @@ static const ScaleKernel available_kernels[] =
{
"neon_u8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<uint8_t>)
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
},
{
"neon_s16_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
};
diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h
index c91242f5b2..9679f161e7 100644
--- a/src/cpu/kernels/scale/neon/list.h
+++ b/src/cpu/kernels/scale/neon/list.h
@@ -42,6 +42,8 @@ namespace cpu
InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
bool align_corners, const Window &window)
+DECLARE_SCALE_KERNEL(s16_neon_scale);
+DECLARE_SCALE_KERNEL(u8_neon_scale);
DECLARE_SCALE_KERNEL(qasymm8_neon_scale);
DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
@@ -51,43 +53,90 @@ template <typename T>
void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
bool align_corners, const Window &window)
{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
+ ARM_COMPUTE_UNUSED(offsets);
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
+ // Compute the ratio between source and destination dimensions
+ const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
+ const int in_stride_y = src->info()->strides_in_bytes()[1];
+ const int in_stride_z = src->info()->strides_in_bytes()[2];
+ const int in_stride_w = src->info()->strides_in_bytes()[3];
+ const int out_stride_y = dst->info()->strides_in_bytes()[1];
+ const int out_stride_z = dst->info()->strides_in_bytes()[2];
+ const int out_stride_w = dst->info()->strides_in_bytes()[3];
+ const int out_dim_ch = dst->info()->dimension(0);
+ const int step_cout = 16 / sizeof(T);
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
- execute_window_loop(win, [&](const Coordinates & id)
+ const int xo_start = window_execution.y().start();
+ const int xo_end = window_execution.y().end();
+ const int xo_step = window_execution.y().step();
+ const int yo_start = window_execution.z().start();
+ const int yo_end = window_execution.z().end();
+ const int yo_step = window_execution.z().step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
+
+ for(int bo = bo_start; bo < bo_end; bo += bo_step)
{
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ for(int yo = yo_start; yo < yo_end; yo += yo_step)
{
- *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ // Floating-point coordinate
+ float yi_f = ((yo + sampling_offset) * scale_y);
+ int yi = 0;
+ if(align_corners)
+ {
+ yi = utils::rounding::round_half_away_from_zero(yi_f);
+ }
+ else
+ {
+ yi = static_cast<int>(std::floor(yi_f));
+ }
+
+ for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ float xi_f = ((xo + sampling_offset) * scale_x);
+ int xi = 0;
+ if(align_corners)
+ {
+ xi = utils::rounding::round_half_away_from_zero(xi_f);
+ }
+ else
+ {
+ xi = static_cast<int>(std::floor(xi_f));
+ }
+
+ const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
+ uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
+
+ int cout = 0;
+ for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
+ }
+
+ for(; cout < out_dim_ch; ++cout)
+ {
+ auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
+ }
+ }
}
- },
- out);
+ }
}
template <typename T>
@@ -95,21 +144,43 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
bool align_corners, const Window &window)
{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ ARM_COMPUTE_UNUSED(offsets);
+ ARM_COMPUTE_UNUSED(dx);
+ ARM_COMPUTE_UNUSED(dy);
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Compute the ratio between source and destination dimensions
+ const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_stride_y = src->info()->strides_in_bytes()[1];
+ const int in_stride_z = src->info()->strides_in_bytes()[2];
+ const int in_stride_w = src->info()->strides_in_bytes()[3];
+ const int out_stride_y = dst->info()->strides_in_bytes()[1];
+ const int out_stride_z = dst->info()->strides_in_bytes()[2];
+ const int out_stride_w = dst->info()->strides_in_bytes()[3];
const int in_dim_w = src->info()->dimension(1);
const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+ const int out_dim_ch = dst->info()->dimension(0);
+ const int step_cout = 16 / sizeof(T);
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
+ Window window_execution = window;
+ window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_in_out(window);
+ win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Iterator in(src, win_in_out);
+ Iterator out(dst, win_in_out);
+
+ const int xo_start = window_execution.y().start();
+ const int xo_end = window_execution.y().end();
+ const int xo_step = window_execution.y().step();
+ const int yo_start = window_execution.z().start();
+ const int yo_end = window_execution.z().end();
+ const int yo_step = window_execution.z().step();
+ const int bo_start = window_execution[3].start();
+ const int bo_end = window_execution[3].end();
+ const int bo_step = window_execution[3].step();
if(border_mode == BorderMode::CONSTANT)
{
@@ -119,45 +190,203 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
using ConstType = T;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
+
+ for(int bo = bo_start; bo < bo_end; bo += bo_step)
{
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
+
+ for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
+ // Integer coordinate
+ const auto yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const auto a1 = (yi_f - static_cast<float>(yi));
+ const auto b1 = (1.f - a1);
+
+ for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
+ // Integer coordinate
+ const auto xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const auto a = (xi_f - static_cast<float>(xi));
+ const auto b = (1.f - a);
+
+ const auto s00_s = static_cast<T>(b * b1);
+ const auto s01_s = static_cast<T>(a * b1);
+ const auto s10_s = static_cast<T>(b * a1);
+ const auto s11_s = static_cast<T>(a * a1);
+
+ const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
+ uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
+
+ int cout = 0;
+ for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
+ if((yi >= 0) && (yi < in_dim_h))
+ {
+ if((xi >= 0) && (xi < in_dim_w))
+ {
+ in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ }
+ if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ }
+ }
+ if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ {
+ if((xi >= 0) && (xi < in_dim_w))
+ {
+ in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ }
+ if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ }
+ }
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+ auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ out0 = wrapper::vmla(out0, in00, s00);
+ out0 = wrapper::vmla(out0, in01, s01);
+ out0 = wrapper::vmla(out0, in10, s10);
+ out0 = wrapper::vmla(out0, in11, s11);
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
+ }
+
+ for(; cout < out_dim_ch; ++cout)
+ {
+ auto in00 = static_cast<T>(const_border_value);
+ auto in01 = static_cast<T>(const_border_value);
+ auto in10 = static_cast<T>(const_border_value);
+ auto in11 = static_cast<T>(const_border_value);
+ if((yi >= 0) && (yi < in_dim_h))
+ {
+ if((xi >= 0) && (xi < in_dim_w))
+ {
+ in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ }
+ if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ }
+ }
+ if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ {
+ if((xi >= 0) && (xi < in_dim_w))
+ {
+ in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ }
+ if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ {
+ in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ }
+ }
+ auto out0 = static_cast<T>(0);
+ out0 += in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+ *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
+ }
+ }
+ }
+ }
}
else if(border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ for(int bo = bo_start; bo < bo_end; bo += bo_step)
{
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ const uint8_t *in_ptr = in.ptr() + bo * in_stride_w;
+ uint8_t *out_ptr = out.ptr() + bo * out_stride_w;
+
+ for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ {
+ // Floating-point coordinate
+ const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
+ // Integer coordinate
+ const auto yi = static_cast<int>(std::floor(yi_f));
+ // Weight for the y coordinate
+ const auto a1 = (yi_f - static_cast<float>(yi));
+ const auto b1 = (1.f - a1);
+
+ const auto yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
+ const auto yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
+
+ for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ {
+ // Floating-point coordinate
+ const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
+ // Integer coordinate
+ const auto xi = static_cast<int>(std::floor(xi_f));
+ // Weight for the x coordinate
+ const auto a = (xi_f - static_cast<float>(xi));
+ const auto b = (1.f - a);
+
+ const auto s00_s = static_cast<T>(b * b1);
+ const auto s01_s = static_cast<T>(a * b1);
+ const auto s10_s = static_cast<T>(b * a1);
+ const auto s11_s = static_cast<T>(a * a1);
+
+ const auto xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
+ const auto xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
+
+ int cout = 0;
+ for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ {
+ auto in00 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ auto in01 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ auto in10 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ auto in11 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
+ in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
+ in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
+ in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
+
+ const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
+ const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
+ const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
+ const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
+ auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ out0 = wrapper::vmla(out0, in00, s00);
+ out0 = wrapper::vmla(out0, in01, s01);
+ out0 = wrapper::vmla(out0, in10, s10);
+ out0 = wrapper::vmla(out0, in11, s11);
+ wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T) + xo * out_stride_y + yo * out_stride_z), out0);
+ }
+
+ for(; cout < out_dim_ch; ++cout)
+ {
+ auto in00 = static_cast<T>(0);
+ auto in01 = static_cast<T>(0);
+ auto in10 = static_cast<T>(0);
+ auto in11 = static_cast<T>(0);
+ in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
+ in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
+ in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
+ in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
+ auto out0 = static_cast<T>(0);
+ out0 += in00 * s00_s;
+ out0 += in01 * s01_s;
+ out0 += in10 * s10_s;
+ out0 += in11 * s11_s;
+ *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T) + xo * out_stride_y + yo * out_stride_z)) = out0;
+ }
+ }
+ }
+ }
}
else
{
diff --git a/tests/datasets/ScaleValidationDataset.h b/tests/datasets/ScaleValidationDataset.h
index c0073f93f5..11e0343582 100644
--- a/tests/datasets/ScaleValidationDataset.h
+++ b/tests/datasets/ScaleValidationDataset.h
@@ -147,7 +147,7 @@ framework::dataset::make("AlignCorners", { true }));
*/
#define SCALE_SHAPE_DATASET(element_per_iteration) \
concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(), \
- ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()), \
+ ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()), \
ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 1>()), \
ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
@@ -166,7 +166,7 @@ framework::dataset::make("AlignCorners", { true }));
*/
#define SCALE_NIGHTLY_SHAPE_DATASET(element_per_iteration) \
concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(), \
- ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
+ ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()), \
ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
@@ -177,6 +177,12 @@ framework::dataset::make("AlignCorners", { true }));
datasets::BorderModes()), \
samping_policy_set)
+#define ASSEMBLE_NHWC_DATASET(shape, samping_policy_set) \
+ combine(combine(combine(combine((shape), framework::dataset::make("DataLayout", DataLayout::NHWC)), \
+ ScaleInterpolationPolicySet), \
+ framework::dataset::make("BorderMode", { BorderMode::CONSTANT, BorderMode::REPLICATE })), \
+ samping_policy_set)
+
/** Generating dataset for quantized data tyeps with the given shapes */
#define ASSEMBLE_QUANTIZED_DATASET(shape, sampling_policy_set, quantization_info_set) \
combine(combine(combine(combine(combine(shape, \
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 64427ae34f..e386d804ca 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -81,6 +81,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
RelativeTolerance<float> tolerance_f32(0.05);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr float abs_tolerance_f16(0.01f);
RelativeTolerance<half> tolerance_f16(half(0.1));
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -324,7 +325,8 @@ using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture<T
TEST_SUITE(Float)
TEST_SUITE(FP32)
-const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F32));
FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
{
//Create valid region
@@ -352,10 +354,38 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::D
// Validate output
validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
TEST_SUITE_END() // FP32
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F16));
FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
{
//Create valid region
@@ -363,7 +393,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::A
const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
// Validate output
- validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
}
FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
{
@@ -372,7 +402,34 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::Da
const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
// Validate output
- validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<half>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
}
TEST_SUITE_END() // FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -435,7 +492,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::Da
// Validate output
validate(Accessor(_target), _reference, valid_region, tolerance_u8);
}
-FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet,
+ QuantizationInfoSet))
{
//Create valid region
TensorInfo src_info(_shape, 1, _data_type);