aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-11-16 18:19:43 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2018-11-20 18:45:09 +0000
commitc47ef20d69e8ea0f519fdc679435cd7037fc18fe (patch)
tree4f3aaeb62a5bc2d6177d2212a1322b8eb17e49fc
parent2a35f425dc65afb517bba396e1c3be98ec7f507f (diff)
downloadComputeLibrary-c47ef20d69e8ea0f519fdc679435cd7037fc18fe.tar.gz
COMPMID-1646: NEResizeBilinearLayer NHWC
-Adds NHWC support for FP16 Change-Id: I61addf8efecf511ac8cd5f8aa9afc3e09c476aaf
-rw-r--r--arm_compute/core/CL/kernels/CLScaleKernel.h4
-rw-r--r--arm_compute/core/NEON/kernels/NEScaleKernel.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLScale.h4
-rw-r--r--arm_compute/runtime/NEON/functions/NEScale.h4
-rw-r--r--src/core/NEON/kernels/NEScaleKernel.cpp108
-rw-r--r--tests/validation/NEON/Scale.cpp35
6 files changed, 149 insertions, 10 deletions
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index db87519c0f..ff72af29fc 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -37,7 +37,7 @@ class CLScaleKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's inputs, output and interpolation policy
*
- * @param[in] input Source tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input Source tensor. Data types supported: U8/QASYMM8/S16/F16/F32
* @param[out] output Destination tensor. Data types supported: Same as @p input
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy Interpolation type to use
@@ -48,7 +48,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
*
- * @param[in] input Source tensor info. Data types supported: U8/S16/F16/F32
+ * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/S16/F16/F32
* @param[in] output Destination tensor info. Data types supported: Same as @p input
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy Interpolation type to use
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 575814b84c..c851b3d335 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -56,7 +56,7 @@ public:
*
* @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
*
- * @param[in] input Source tensor. Data types supported: U8/S16/F32.
+ * @param[in] input Source tensor. Data types supported: U8/S16/F16/F32.
* @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
* @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
* @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
@@ -71,7 +71,7 @@ public:
*
* @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
*
- * @param[in] input Source tensor. Data types supported: U8/S16/F32.
+ * @param[in] input Source tensor. Data types supported: U8/S16/F16/F32.
* @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
* @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
* @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index c4b8a2aada..ec324942d3 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -39,7 +39,7 @@ class CLScale : public ICLSimpleFunction
public:
/** Initialize the function's source, destination, interpolation type and border_mode.
*
- * @param[in,out] input Source tensor. Data types supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[out] output Destination tensor. Data types supported: Same as @p input
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy The interpolation type.
@@ -52,7 +52,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLScale
*
- * @param[in] input Source tensor info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/S16/F16/F32.
* @param[in] output Output tensor info. Data type supported: Same as @p input
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy The interpolation type.
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 9b5a12bece..d59e3cccb6 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -47,7 +47,7 @@ public:
NEScale();
/** Initialize the function's source, destination, interpolation type and border_mode.
*
- * @param[in, out] input Source tensor. Data type supported: U8/S16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in, out] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy The interpolation type.
* @param[in] border_mode Strategy to use for borders.
@@ -58,7 +58,7 @@ public:
SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
/** Static function to check if given info will lead to a valid configuration of @ref NEScale
*
- * @param[in] input Source tensor. Data type supported: U8/S16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy The interpolation type.
* @param[in] border_mode Strategy to use for borders.
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 71116447f4..5fef4f9744 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -46,7 +47,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const
const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
BorderMode border_mode, SamplingPolicy sampling_policy)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output == input);
@@ -463,6 +465,48 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
in, offsets, out);
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ float16x8x2_t tmp =
+ {
+ {
+ vdupq_n_f16(0),
+ vdupq_n_f16(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
+
+ vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
float32x4x4_t tmp =
@@ -515,7 +559,7 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
void NEScaleKernel::scale_bilinear_nchw(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -626,6 +670,50 @@ void NEScaleKernel::scale_bilinear_nchw(const Window &window)
in, offsets, dx, dy, out);
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float16x8x2_t tmp =
+ {
+ {
+ vdupq_n_f16(0),
+ vdupq_n_f16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
+
+ vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
execute_window_loop(window, [&](const Coordinates & id)
@@ -777,6 +865,22 @@ void NEScaleKernel::scale_nhwc(const Window &window)
}
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c);
+ }
+ else
+ {
+ scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ }
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 0d4a86e372..0d9c8e1f8c 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -66,6 +66,9 @@ const auto ScaleDataLayouts = framework::dataset::make("DataLayout",
constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
RelativeTolerance<float> tolerance_f32(0.01);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+RelativeTolerance<half> tolerance_f16(half(0.1));
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
constexpr float tolerance_num_s16 = 0.01f;
constexpr float tolerance_num_f32 = 0.01f;
@@ -230,6 +233,38 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<float>, framework::DatasetMode::
validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
}
TEST_SUITE_END()
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+ DataType::F16)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+ datasets::BorderModes()),
+ framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+ DataType::F16)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+ datasets::BorderModes()),
+ framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+{
+ //Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+ // Validate output
+ validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
TEST_SUITE_END()
TEST_SUITE(Integer)