From 0c34fe29c298057091d48cde332cb60bb14efee1 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Mon, 26 Jun 2017 17:17:42 +0100 Subject: COMPMID-421: Added FP16 support in Pooling Layer Change-Id: I6b6119c8770051c1656da40aa073c539c15b493e Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78985 Reviewed-by: Moritz Pflanzer Tested-by: Kaizen --- .../core/NEON/kernels/NEPoolingLayerKernel.h | 17 ++- arm_compute/core/PixelValue.h | 48 ++++-- scripts/check_clang-tidy.py | 1 + scripts/clang-tidy.h | 23 ++- src/core/NEON/kernels/NEFillBorderKernel.cpp | 18 ++- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 166 +++++++++++++++++++-- tests/benchmark_new/NEON/PoolingLayer.cpp | 23 +-- tests/validation/NEON/PoolingLayer.cpp | 31 +++- tests/validation/Reference.cpp | 1 + tests/validation/TensorOperations.h | 22 ++- 10 files changed, 306 insertions(+), 44 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index bf06fdd639..a5de81137b 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -48,7 +48,7 @@ public: ~NEPoolingLayerKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[in] input Source tensor. Data types supported: QS8/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ @@ -66,6 +66,14 @@ private: */ template void pooling2_f32(const Window &window_input, const Window &window); + /** Function to perform 2x2 pooling for float16_t. + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template + void pooling2_f16(const Window &window_input, const Window &window); + /** Function to perform 2x2 pooling for 8bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -80,6 +88,13 @@ private: */ template void pooling3_f32(const Window &window_input, const Window &window); + /** Function to perform 3x3 pooling. + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template + void pooling3_f16(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h index b4912ce15a..1b1a5a3845 100644 --- a/arm_compute/core/PixelValue.h +++ b/arm_compute/core/PixelValue.h @@ -26,6 +26,10 @@ #include +#if ARM_COMPUTE_ENABLE_FP16 +#include // needed for float16_t +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + namespace arm_compute { /** Class describing the value of a pixel for any image format. */ @@ -82,6 +86,17 @@ public: { value.s32 = v; } +#if ARM_COMPUTE_ENABLE_FP16 + /** Initialize the union with a F16 pixel value + * + * @param[in] v F16 value. + */ + PixelValue(float16_t v) + : PixelValue() + { + value.f16 = v; + } +#endif /* ARM_COMPUTE_ENABLE_FP16 */ /** Initialize the union with a F32 pixel value * * @param[in] v F32 value. @@ -96,16 +111,19 @@ public: */ union { - uint8_t rgb[3]; /**< 3 channels: RGB888 */ - uint8_t yuv[3]; /**< 3 channels: Any YUV format */ - uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */ - float f32; /**< Single channel float 32 */ - uint8_t u8; /**< Single channel U8 */ - int8_t s8; /**< Single channel S8 */ - uint16_t u16; /**< Single channel U16 */ - int16_t s16; /**< Single channel S16 */ - uint32_t u32; /**< Single channel U32 */ - int32_t s32; /**< Single channel S32 */ + uint8_t rgb[3]; /**< 3 channels: RGB888 */ + uint8_t yuv[3]; /**< 3 channels: Any YUV format */ + uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */ + float f32; /**< Single channel float 32 */ +#if ARM_COMPUTE_ENABLE_FP16 + float16_t f16; /**< Single channel F16 */ +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + uint8_t u8; /**< Single channel U8 */ + int8_t s8; /**< Single channel S8 */ + uint16_t u16; /**< Single channel U16 */ + int16_t s16; /**< Single channel S16 */ + uint32_t u32; /**< Single channel U32 */ + int32_t s32; /**< Single channel S32 */ } value; /** Interpret the pixel value as a U8 * @@ -155,6 +173,16 @@ public: { v = value.s32; } +#if ARM_COMPUTE_ENABLE_FP16 + /** Interpret the pixel value as a F16 + * + * @param[out] v Returned value + */ + void get(float16_t &v) const + { + v = value.f16; + } +#endif /* ARM_COMPUTE_ENABLE_FP16 */ /** Interpret the pixel value as a F32 * * @param[out] v Returned value diff --git a/scripts/check_clang-tidy.py b/scripts/check_clang-tidy.py index b24f5f7f8f..6c2173b6fe 100755 --- a/scripts/check_clang-tidy.py +++ b/scripts/check_clang-tidy.py @@ -46,6 +46,7 @@ if __name__ == "__main__": ("ReferenceCPP.cpp" in line and "parameter 'srcs' is unused" in line) or ("ReferenceCPP.cpp" in line and re.search(r"parameter '[^']+' is unused", line)) or ("NEGEMMMatrixMultiplyKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or + ("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or "3rdparty" in line): continue diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h index cbc0d07cd6..ccf267e9b8 100644 --- a/scripts/clang-tidy.h +++ b/scripts/clang-tidy.h @@ -1,5 +1,15 @@ #include +inline float16x4_t vpmax_f16 (float16x4_t, float16x4_t) +{ + return vdup_n_f16(0); +} + +inline float16x4_t vpadd_f16 (float16x4_t, float16x4_t) +{ + return vdup_n_f16(0); +} + inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int) { return vdupq_n_f16(0); @@ -7,22 +17,27 @@ inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int) inline float16x4_t vmul_f16 (float16x4_t, float16x4_t) { - return vdup_n_u16(0); + return vdup_n_f16(0); } inline float16x4_t vadd_f16 (float16x4_t, float16x4_t) { - return vdup_n_u16(0); + return vdup_n_f16(0); } inline float16x4_t vmul_lane_f16 (float16x4_t, float16x4_t, const int) { - return vdup_n_u16(0); + return vdup_n_f16(0); } inline float16x4_t vmul_n_f16 (float16x4_t, float16_t) { - return vdup_n_u16(0); + return vdup_n_f16(0); +} + +inline float16x4_t vmax_f16(float16x4_t, float16x4_t) +{ + return vdup_n_f16(0); } inline float16x8_t vcvtq_f16_u16(uint16x8_t) diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index bd99242b11..cd84e36aad 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -33,6 +33,10 @@ #include #include +#if ARM_COMPUTE_ENABLE_FP16 +#include // needed for float16_t +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + using namespace arm_compute; namespace arm_compute @@ -47,7 +51,7 @@ NEFillBorderKernel::NEFillBorderKernel() void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); _tensor = tensor; _border_size = border_size; @@ -100,6 +104,12 @@ void NEFillBorderKernel::run(const Window &window) case DataType::S32: fill_constant_value_single_channel(window); break; +#ifdef ARM_COMPUTE_ENABLE_FP16 + case DataType::F16: + static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit"); + fill_constant_value_single_channel(window); + break; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::F32: static_assert(sizeof(float) == 4, "Float must be 32 bit"); fill_constant_value_single_channel(window); @@ -133,6 +143,12 @@ void NEFillBorderKernel::run(const Window &window) case DataType::S32: fill_replicate_single_channel(window); break; +#ifdef ARM_COMPUTE_ENABLE_FP16 + case DataType::F16: + static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit"); + fill_replicate_single_channel(window); + break; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::F32: static_assert(sizeof(float) == 4, "Float must be 32 bit"); fill_replicate_single_channel(window); diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 2ef2b9881f..ce977140fb 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -48,17 +48,17 @@ namespace inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { - int start_x = id.x() * stride_x - pad_x; - int start_y = id.y() * stride_y - pad_y; - int end_x = std::min(start_x + pool_size, upper_bound_w); - int end_y = std::min(start_y + pool_size, upper_bound_h); + const int start_x = id.x() * stride_x - pad_x; + const int start_y = id.y() * stride_y - pad_y; + const int end_x = std::min(start_x + pool_size, upper_bound_w); + const int end_y = std::min(start_y + pool_size, upper_bound_h); return 1.f / ((end_y - start_y) * (end_x - start_x)); } inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h, int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position) { - static std::array scale_values_q8 = + static const std::array scale_values_q8 = { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } }; const int start_x = id.x() * stride_x - pad_x; const int start_y = id.y() * stride_y - pad_y; @@ -96,8 +96,11 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons static const std::set supported_pool_sizes = { 2, 3, 7 }; ARM_COMPUTE_UNUSED(supported_pool_sizes); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()); ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32); ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); @@ -140,9 +143,30 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons break; default: ARM_COMPUTE_ERROR("Pooling size not supported"); + break; } num_elems_horizontal_window = 8; break; +#ifdef ARM_COMPUTE_ENABLE_FP16 + case DataType::F16: + switch(pool_size) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = 8; + num_elems_horizontal_window = 8; + break; + case 3: + num_elems_read_per_iteration = 4; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + ARM_COMPUTE_ERROR("Pooling size not supported"); + break; + } + break; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::F32: switch(pool_size) { @@ -157,6 +181,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons break; default: ARM_COMPUTE_ERROR("Pooling size not supported"); + break; } num_elems_processed_per_iteration = 1; num_elems_horizontal_window = 1; @@ -188,6 +213,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons { _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8 : &NEPoolingLayerKernel::pooling2_q8; } + else if(input->info()->data_type() == DataType::F16) + { + _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + } else if(input->info()->data_type() == DataType::F32) { _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; @@ -198,6 +227,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons { _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8 : &NEPoolingLayerKernel::pooling3_q8; } + else if(input->info()->data_type() == DataType::F16) + { + _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + } else if(input->info()->data_type() == DataType::F32) { _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; @@ -265,6 +298,101 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window input, output); } +template +void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window) +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr const int pool_size = 3; + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const float16x4_t top_data = vld1_f16(reinterpret_cast(input_top_ptr + input.offset())); + const float16x4_t middle_data = vld1_f16(reinterpret_cast(input_middle_ptr + input.offset())); + const float16x4_t bottom_data = vld1_f16(reinterpret_cast(input_bottom_ptr + input.offset())); + float16x4_t res = {}; + if(pooling_type == PoolingType::AVG) + { + // Calculate scale + const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(-std::numeric_limits::max(), max_data, 3), max_data); + res = vpmax_f16(res, res); + } + *(reinterpret_cast(output.ptr())) = vget_lane_f16(res, 0); + }, + input, output); +#else /* ARM_COMPUTE_ENABLE_FP16 */ + ARM_COMPUTE_UNUSED(window_input); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a"); +#endif /* ARM_COMPUTE_ENABLE_FP16 */ +} + +template +void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window) +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + Iterator input(_input, window_input); + Iterator output(_output, window); + constexpr int pool_size = 2; + int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = vld2q_f16(reinterpret_cast(input_top_ptr + input.offset())); + const auto bottom_data = vld2q_f16(reinterpret_cast(input_bottom_ptr + input.offset())); + float16x8_t res = {}; + + if(pooling_type == PoolingType::AVG) + { + const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1])))); + } + else + { + res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1]))); + } + vst1q_f16(reinterpret_cast(output.ptr()), res); + }, + input, output); +#else /* ARM_COMPUTE_ENABLE_FP16 */ + ARM_COMPUTE_UNUSED(window_input); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a"); +#endif /* ARM_COMPUTE_ENABLE_FP16 */ +} + template void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window) { @@ -496,19 +624,29 @@ void NEPoolingLayerKernel::run(const Window &window) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - unsigned int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first; + const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second; // Set step for input in x and y direction for the input Window window_input(window); unsigned int window_x_inc = 0; - if(_input->info()->data_type() == DataType::QS8) - { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; - } - else + switch(_input->info()->data_type()) { - window_x_inc = pool_stride_x; + case DataType::QS8: + case DataType::F16: + { + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + break; + } + case DataType::F32: + { + window_x_inc = pool_stride_x; + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } } window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); diff --git a/tests/benchmark_new/NEON/PoolingLayer.cpp b/tests/benchmark_new/NEON/PoolingLayer.cpp index baa6e31483..c9d598d4a9 100644 --- a/tests/benchmark_new/NEON/PoolingLayer.cpp +++ b/tests/benchmark_new/NEON/PoolingLayer.cpp @@ -39,24 +39,29 @@ namespace arm_compute { namespace test { +namespace +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 +const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F16, DataType::F32 }); +const auto lenet_data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +#else /* ARM_COMPUTE_ENABLE_FP16 */ +const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F32 }); +const auto lenet_data_types = framework::dataset::make("DataType", { DataType::F32 }); +#endif /* ARM_COMPUTE_ENABLE_FP16 */ +} // namespace + using NEPoolingLayerFixture = PoolingLayerFixture; TEST_SUITE(NEON) REGISTER_FIXTURE_DATA_TEST_CASE(AlexNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL, - framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(), - framework::dataset::make("DataType", { DataType::F32, DataType::QS8 })), - framework::dataset::make("Batches", { 1, 4, 8 }))); + framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(), alexnet_data_types), framework::dataset::make("Batches", { 1, 4, 8 }))); REGISTER_FIXTURE_DATA_TEST_CASE(LeNet5PoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL, - framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Batches", { 1, 4, 8 }))); + framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 }))); REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL, - framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Batches", { 1, 4, 8 }))); + framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 }))); TEST_SUITE_END() } // namespace test diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index 9d6c8824ca..3961770310 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -39,8 +39,11 @@ using namespace arm_compute::test::validation; namespace { -const float tolerance_q = 0; /**< Tolerance value for comparing reference's output against implementation's output for quantized input */ -const float tolerance_f = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */ +const float tolerance_q = 0; /**< Tolerance value for comparing reference's output against implementation's output for quantized input */ +const float tolerance_f32 = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */ +#ifdef ARM_COMPUTE_ENABLE_FP16 +const float tolerance_f16 = 0.001f; /**< Tolerance value for comparing reference's output against half precision floating point implementation's output */ +#endif /* ARM_COMPUTE_ENABLE_FP16 */ /** Compute Neon pooling layer function. * @@ -73,6 +76,7 @@ Tensor compute_pooling_layer(const TensorShape &shape_in, const TensorShape &sha switch(dt) { case DataType::F32: + case DataType::F16: min = -1; max = 1; break; @@ -123,7 +127,7 @@ BOOST_DATA_TEST_CASE(RandomDataset, RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info); // Validate output - validate(Accessor(dst), ref_dst, tolerance_f, 0); + validate(Accessor(dst), ref_dst, tolerance_f32, 0); } BOOST_DATA_TEST_CASE(RunSmall7x7, @@ -140,10 +144,29 @@ BOOST_DATA_TEST_CASE(RunSmall7x7, RawTensor ref_dst = Reference::compute_reference_pooling_layer(src_shape, dst_shape, dt, pool_info); // Validate output - validate(Accessor(dst), ref_dst, tolerance_f, 0); + validate(Accessor(dst), ref_dst, tolerance_f32, 0); } BOOST_AUTO_TEST_SUITE_END() +#ifdef ARM_COMPUTE_ENABLE_FP16 +BOOST_AUTO_TEST_SUITE(Float16) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RandomDataset, + RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::F16), + obj, dt) +{ + // Compute function + Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info); + + // Validate output + validate(NEAccessor(dst), ref_dst, tolerance_f16, 0); +} +BOOST_AUTO_TEST_SUITE_END() +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + BOOST_AUTO_TEST_SUITE(Quantized) BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) BOOST_DATA_TEST_CASE(RandomDataset, diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp index c0d5a20c16..b838907320 100644 --- a/tests/validation/Reference.cpp +++ b/tests/validation/Reference.cpp @@ -698,6 +698,7 @@ RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in switch(dt) { case DataType::F32: + case DataType::F16: min = -1; max = 1; break; diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h index 67dadd6da3..b8e5a6678c 100644 --- a/tests/validation/TensorOperations.h +++ b/tests/validation/TensorOperations.h @@ -44,6 +44,26 @@ #include #include +#if ARM_COMPUTE_ENABLE_FP16 +//Beware! most std templates acting on types don't work with the data type float16_t +namespace std +{ +template <> +class numeric_limits +{ +public: + static float16_t lowest() + { + return -std::numeric_limits::max(); // -inf + }; + static float16_t max() + { + return std::numeric_limits::max(); // +inf + }; +}; +} +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + namespace arm_compute { namespace test @@ -1476,7 +1496,7 @@ void pooling_layer(const Tensor &in, Tensor &out, PoolingLayerInfo pool_in { for(int x = wstart; x < wend; ++x) { - T val = in[r * h_in * w_in + y * w_in + x]; + const T val = in[r * h_in * w_in + y * w_in + x]; if(val > max_val) { max_val = val; -- cgit v1.2.1