author    Pablo Tello <pablo.tello@arm.com>  2017-06-26 17:17:42 +0100
committer Anthony Barbier <anthony.barbier@arm.com>  2018-09-17 14:16:42 +0100
commit    0c34fe29c298057091d48cde332cb60bb14efee1 (patch)
tree      f77502715904a522b0e76f32eb1bcd6ce2f567dc
parent    2b26b850c0cff6a25f1012e9e4e7fe6654364a88 (diff)
download  ComputeLibrary-0c34fe29c298057091d48cde332cb60bb14efee1.tar.gz
COMPMID-421: Added FP16 support in Pooling Layer
Change-Id: I6b6119c8770051c1656da40aa073c539c15b493e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78985
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h  |  17
-rw-r--r--  arm_compute/core/PixelValue.h                         |  48
-rwxr-xr-x  scripts/check_clang-tidy.py                           |   1
-rw-r--r--  scripts/clang-tidy.h                                  |  23
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp          |  18
-rw-r--r--  src/core/NEON/kernels/NEPoolingLayerKernel.cpp        | 166
-rw-r--r--  tests/benchmark_new/NEON/PoolingLayer.cpp             |  23
-rw-r--r--  tests/validation/NEON/PoolingLayer.cpp                |  31
-rw-r--r--  tests/validation/Reference.cpp                        |   1
-rw-r--r--  tests/validation/TensorOperations.h                   |  22
10 files changed, 306 insertions, 44 deletions
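
Editor's note before the hunks: a minimal standalone sketch of the NEON pattern the new FP16 2x2 kernel uses (not the library's kernel itself). vld2q_f16 de-interleaves 16 consecutive half floats into even and odd columns, so one tree of vmaxq_f16 finishes eight 2x2 windows at once. It assumes an ARMv8.2-A toolchain with FP16 vector arithmetic and rows with at least 16 readable values; the helper name is made up for illustration.

#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Pools two adjacent rows of 16 float16_t values (2x2 window, stride 2, no padding)
// into 8 outputs: vld2q_f16 splits even/odd columns, one max tree finishes each window.
static void max_pool_2x2_f16_sketch(const float16_t *top, const float16_t *bottom, float16_t *out)
{
    const float16x8x2_t t = vld2q_f16(top);    // t.val[0]: even columns, t.val[1]: odd columns
    const float16x8x2_t b = vld2q_f16(bottom);
    const float16x8_t   m = vmaxq_f16(vmaxq_f16(t.val[0], t.val[1]),
                                      vmaxq_f16(b.val[0], b.val[1]));
    vst1q_f16(out, m);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
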
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index bf06fdd639..a5de81137b 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -48,7 +48,7 @@ public:
~NEPoolingLayerKernel() = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[in] input Source tensor. Data types supported: QS8/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
@@ -66,6 +66,14 @@ private:
*/
template <PoolingType pooling_type>
void pooling2_f32(const Window &window_input, const Window &window);
+ /** Function to perform 2x2 pooling for float16_t.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling2_f16(const Window &window_input, const Window &window);
+
/** Function to perform 2x2 pooling for 8bit fixed point.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -80,6 +88,13 @@ private:
*/
template <PoolingType pooling_type>
void pooling3_f32(const Window &window_input, const Window &window);
+ /** Function to perform 3x3 pooling.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling3_f16(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling for 8bit fixed point.
*
* @param[in] window_input Input region on which to execute the kernel.
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index b4912ce15a..1b1a5a3845 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -26,6 +26,10 @@
#include <cstdint>
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
namespace arm_compute
{
/** Class describing the value of a pixel for any image format. */
@@ -82,6 +86,17 @@ public:
{
value.s32 = v;
}
+#if ARM_COMPUTE_ENABLE_FP16
+ /** Initialize the union with a F16 pixel value
+ *
+ * @param[in] v F16 value.
+ */
+ PixelValue(float16_t v)
+ : PixelValue()
+ {
+ value.f16 = v;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
/** Initialize the union with a F32 pixel value
*
* @param[in] v F32 value.
@@ -96,16 +111,19 @@ public:
*/
union
{
- uint8_t rgb[3]; /**< 3 channels: RGB888 */
- uint8_t yuv[3]; /**< 3 channels: Any YUV format */
- uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
- float f32; /**< Single channel float 32 */
- uint8_t u8; /**< Single channel U8 */
- int8_t s8; /**< Single channel S8 */
- uint16_t u16; /**< Single channel U16 */
- int16_t s16; /**< Single channel S16 */
- uint32_t u32; /**< Single channel U32 */
- int32_t s32; /**< Single channel S32 */
+ uint8_t rgb[3]; /**< 3 channels: RGB888 */
+ uint8_t yuv[3]; /**< 3 channels: Any YUV format */
+ uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
+ float f32; /**< Single channel float 32 */
+#if ARM_COMPUTE_ENABLE_FP16
+ float16_t f16; /**< Single channel F16 */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ uint8_t u8; /**< Single channel U8 */
+ int8_t s8; /**< Single channel S8 */
+ uint16_t u16; /**< Single channel U16 */
+ int16_t s16; /**< Single channel S16 */
+ uint32_t u32; /**< Single channel U32 */
+ int32_t s32; /**< Single channel S32 */
} value;
/** Interpret the pixel value as a U8
*
@@ -155,6 +173,16 @@ public:
{
v = value.s32;
}
+#if ARM_COMPUTE_ENABLE_FP16
+ /** Interpret the pixel value as a F16
+ *
+ * @param[out] v Returned value
+ */
+ void get(float16_t &v) const
+ {
+ v = value.f16;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
/** Interpret the pixel value as a F32
*
* @param[out] v Returned value
diff --git a/scripts/check_clang-tidy.py b/scripts/check_clang-tidy.py
index b24f5f7f8f..6c2173b6fe 100755
--- a/scripts/check_clang-tidy.py
+++ b/scripts/check_clang-tidy.py
@@ -46,6 +46,7 @@ if __name__ == "__main__":
("ReferenceCPP.cpp" in line and "parameter 'srcs' is unused" in line) or
("ReferenceCPP.cpp" in line and re.search(r"parameter '[^']+' is unused", line)) or
("NEGEMMMatrixMultiplyKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
+ ("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
"3rdparty" in line):
continue
diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h
index cbc0d07cd6..ccf267e9b8 100644
--- a/scripts/clang-tidy.h
+++ b/scripts/clang-tidy.h
@@ -1,5 +1,15 @@
#include <arm_neon.h>
+inline float16x4_t vpmax_f16 (float16x4_t, float16x4_t)
+{
+ return vdup_n_f16(0);
+}
+
+inline float16x4_t vpadd_f16 (float16x4_t, float16x4_t)
+{
+ return vdup_n_f16(0);
+}
+
inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int)
{
return vdupq_n_f16(0);
@@ -7,22 +17,27 @@ inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int)
inline float16x4_t vmul_f16 (float16x4_t, float16x4_t)
{
- return vdup_n_u16(0);
+ return vdup_n_f16(0);
}
inline float16x4_t vadd_f16 (float16x4_t, float16x4_t)
{
- return vdup_n_u16(0);
+ return vdup_n_f16(0);
}
inline float16x4_t vmul_lane_f16 (float16x4_t, float16x4_t, const int)
{
- return vdup_n_u16(0);
+ return vdup_n_f16(0);
}
inline float16x4_t vmul_n_f16 (float16x4_t, float16_t)
{
- return vdup_n_u16(0);
+ return vdup_n_f16(0);
+}
+
+inline float16x4_t vmax_f16(float16x4_t, float16x4_t)
+{
+ return vdup_n_f16(0);
}
inline float16x8_t vcvtq_f16_u16(uint16x8_t)
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index bd99242b11..cd84e36aad 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -33,6 +33,10 @@
#include <algorithm>
#include <cstdint>
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
using namespace arm_compute;
namespace arm_compute
@@ -47,7 +51,7 @@ NEFillBorderKernel::NEFillBorderKernel()
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
_tensor = tensor;
_border_size = border_size;
@@ -100,6 +104,12 @@ void NEFillBorderKernel::run(const Window &window)
case DataType::S32:
fill_constant_value_single_channel<int32_t>(window);
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
+ fill_constant_value_single_channel<float16_t>(window);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
fill_constant_value_single_channel<float>(window);
@@ -133,6 +143,12 @@ void NEFillBorderKernel::run(const Window &window)
case DataType::S32:
fill_replicate_single_channel<int32_t>(window);
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
+ fill_replicate_single_channel<float16_t>(window);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
fill_replicate_single_channel<float>(window);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 2ef2b9881f..ce977140fb 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -48,17 +48,17 @@ namespace
inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = id.x() * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- int end_x = std::min(start_x + pool_size, upper_bound_w);
- int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
{
- static std::array<qint8_t, 10> scale_values_q8 =
+ static const std::array<qint8_t, 10> scale_values_q8 =
{ { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
const int start_x = id.x() * stride_x - pad_x;
const int start_y = id.y() * stride_y - pad_y;
@@ -96,8 +96,11 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
ARM_COMPUTE_UNUSED(supported_pool_sizes);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
@@ -140,9 +143,30 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
break;
default:
ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
}
num_elems_horizontal_window = 8;
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = 8;
+ num_elems_horizontal_window = 8;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
switch(pool_size)
{
@@ -157,6 +181,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
break;
default:
ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
}
num_elems_processed_per_iteration = 1;
num_elems_horizontal_window = 1;
@@ -188,6 +213,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
}
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+ }
else if(input->info()->data_type() == DataType::F32)
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
@@ -198,6 +227,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
}
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+ }
else if(input->info()->data_type() == DataType::F32)
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
@@ -266,6 +299,101 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window
}
template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+ const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+ *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+ constexpr int pool_size = 2;
+ int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x8_t res = {};
+
+ if(pooling_type == PoolingType::AVG)
+ {
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+ res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
+ }
+ else
+ {
+ res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
+ }
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
@@ -496,19 +624,29 @@ void NEPoolingLayerKernel::run(const Window &window)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- unsigned int pool_stride_x, pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
+ const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
// Set step for input in x and y direction for the input
Window window_input(window);
unsigned int window_x_inc = 0;
- if(_input->info()->data_type() == DataType::QS8)
- {
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- }
- else
+ switch(_input->info()->data_type())
{
- window_x_inc = pool_stride_x;
+ case DataType::QS8:
+ case DataType::F16:
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ break;
+ }
+ case DataType::F32:
+ {
+ window_x_inc = pool_stride_x;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
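
Editor's note: as a companion to pooling3_f16 above, a condensed sketch of the 3x3 average reduction it performs; a vertical vadd_f16 across the three rows, lane 3 zeroed so only the window's three columns contribute, then two vpadd_f16 steps collapse the sum into lane 0. The fixed 1/9 scale and the helper name are illustrative assumptions only; the kernel derives its scale with calculate_avg_scale and relies on border padding, so each row pointer here must have at least four readable halves.

#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Average of one 3x3 window: vertical add of three rows, zero lane 3 so only the
// three window columns survive, then two pairwise adds collapse the sum into lane 0.
static float16_t avg_pool_3x3_f16_sketch(const float16_t *top, const float16_t *mid, const float16_t *bot)
{
    const float16x4_t sum = vadd_f16(vadd_f16(vld1_f16(top), vld1_f16(bot)), vld1_f16(mid));
    float16x4_t       red = vpadd_f16(vset_lane_f16(0.f, sum, 3), sum);
    red = vpadd_f16(red, red);                                       // lane 0 = c0 + c1 + c2
    return vget_lane_f16(vmul_n_f16(red, static_cast<float16_t>(1.f / 9.f)), 0); // illustrative fixed 1/9 scale
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
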
diff --git a/tests/benchmark_new/NEON/PoolingLayer.cpp b/tests/benchmark_new/NEON/PoolingLayer.cpp
index baa6e31483..c9d598d4a9 100644
--- a/tests/benchmark_new/NEON/PoolingLayer.cpp
+++ b/tests/benchmark_new/NEON/PoolingLayer.cpp
@@ -39,24 +39,29 @@ namespace arm_compute
{
namespace test
{
+namespace
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F16, DataType::F32 });
+const auto lenet_data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F32 });
+const auto lenet_data_types = framework::dataset::make("DataType", { DataType::F32 });
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+} // namespace
+
using NEPoolingLayerFixture = PoolingLayerFixture<Tensor, NEPoolingLayer, Accessor>;
TEST_SUITE(NEON)
REGISTER_FIXTURE_DATA_TEST_CASE(AlexNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
- framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(),
- framework::dataset::make("DataType", { DataType::F32, DataType::QS8 })),
- framework::dataset::make("Batches", { 1, 4, 8 })));
+ framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(), alexnet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
REGISTER_FIXTURE_DATA_TEST_CASE(LeNet5PoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
- framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Batches", { 1, 4, 8 })));
+ framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
- framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Batches", { 1, 4, 8 })));
+ framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
TEST_SUITE_END()
} // namespace test
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 9d6c8824ca..3961770310 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -39,8 +39,11 @@ using namespace arm_compute::test::validation;
namespace
{
-const float tolerance_q = 0; /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
-const float tolerance_f = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */
+const float tolerance_q = 0; /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+const float tolerance_f32 = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const float tolerance_f16 = 0.001f; /**< Tolerance value for comparing reference's output against half precision floating point implementation's output */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
/** Compute Neon pooling layer function.
*
@@ -73,6 +76,7 @@ Tensor compute_pooling_layer(const TensorShape &shape_in, const TensorShape &sha
switch(dt)
{
case DataType::F32:
+ case DataType::F16:
min = -1;
max = 1;
break;
@@ -123,7 +127,7 @@ BOOST_DATA_TEST_CASE(RandomDataset,
RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
// Validate output
- validate(Accessor(dst), ref_dst, tolerance_f, 0);
+ validate(Accessor(dst), ref_dst, tolerance_f32, 0);
}
BOOST_DATA_TEST_CASE(RunSmall7x7,
@@ -140,10 +144,29 @@ BOOST_DATA_TEST_CASE(RunSmall7x7,
RawTensor ref_dst = Reference::compute_reference_pooling_layer(src_shape, dst_shape, dt, pool_info);
// Validate output
- validate(Accessor(dst), ref_dst, tolerance_f, 0);
+ validate(Accessor(dst), ref_dst, tolerance_f32, 0);
}
BOOST_AUTO_TEST_SUITE_END()
+#ifdef ARM_COMPUTE_ENABLE_FP16
+BOOST_AUTO_TEST_SUITE(Float16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RandomDataset,
+ RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::F16),
+ obj, dt)
+{
+ // Compute function
+ Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f16, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
BOOST_AUTO_TEST_SUITE(Quantized)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
BOOST_DATA_TEST_CASE(RandomDataset,
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
index c0d5a20c16..b838907320 100644
--- a/tests/validation/Reference.cpp
+++ b/tests/validation/Reference.cpp
@@ -698,6 +698,7 @@ RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in
switch(dt)
{
case DataType::F32:
+ case DataType::F16:
min = -1;
max = 1;
break;
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 67dadd6da3..b8e5a6678c 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -44,6 +44,26 @@
#include <string>
#include <vector>
+#if ARM_COMPUTE_ENABLE_FP16
+//Beware! most std templates acting on types don't work with the data type float16_t
+namespace std
+{
+template <>
+class numeric_limits<float16_t>
+{
+public:
+ static float16_t lowest()
+ {
+ return -std::numeric_limits<float>::max(); // -inf
+ };
+ static float16_t max()
+ {
+ return std::numeric_limits<float>::max(); // +inf
+ };
+};
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
namespace arm_compute
{
namespace test
@@ -1476,7 +1496,7 @@ void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_in
{
for(int x = wstart; x < wend; ++x)
{
- T val = in[r * h_in * w_in + y * w_in + x];
+ const T val = in[r * h_in * w_in + y * w_in + x];
if(val > max_val)
{
max_val = val;
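
Editor's note on the numeric_limits<float16_t> specialization in TensorOperations.h: its "-inf"/"+inf" comments rely on float-to-half conversion overflowing to infinity. A tiny self-contained check of that assumption (IEEE half format and ACLE conversion semantics; not part of the patch) could look like this:

#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
#ifdef __ARM_FP16_FORMAT_IEEE
    // FLT_MAX is far outside the half-precision range, so converting it to __fp16
    // overflows to +/-infinity, which is what the "-inf"/"+inf" comments rely on.
    const __fp16 lo = -std::numeric_limits<float>::max();
    const __fp16 hi = std::numeric_limits<float>::max();
    std::printf("lowest -> -inf: %d, max -> +inf: %d\n",
                std::isinf(static_cast<float>(lo)) ? 1 : 0,
                std::isinf(static_cast<float>(hi)) ? 1 : 0);
#else
    std::puts("IEEE half-precision format not available on this target");
#endif
    return 0;
}
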