author    Georgios Pinitas <georgios.pinitas@arm.com>    2018-08-30 16:02:11 +0100
committer Anthony Barbier <anthony.barbier@arm.com>      2018-11-02 16:54:54 +0000
commit    09d3451b4caf8d5e0e7cf2c6097e50a1f815d027 (patch)
tree      8e9b8e9882d0120c85ff08473dbf3721b7602677
parent    b57be0da77370e5e71fe82dfa281f528279e8127 (diff)
download  ComputeLibrary-09d3451b4caf8d5e0e7cf2c6097e50a1f815d027.tar.gz
COMPMID-1548: NEON FP16 mismatches on CannyEdge and HarrisCorners.
Removes FP16 from HarrisCorners and CannyEdge.

Change-Id: I5e4f9205fdbe4de85f04f55ecf1568c837e56cc0
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/146247
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
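For reference, a minimal sketch of how the updated NEON functions are configured after this change. The configure() signatures are taken from the headers touched by this patch; the tensor/array objects and the numeric values (thresholds, gradient/block sizes, sensitivity) are illustrative assumptions only, not part of the patch.

#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"

using namespace arm_compute;

// Sketch only: shows the post-patch call sites without the removed use_fp16 flag.
void configure_after_patch(ITensor &src, ITensor &edges, IImage &img, KeyPointArray &corners)
{
    NECannyEdge canny;
    // No trailing use_fp16 argument any more: the F32 kernels are always used.
    canny.configure(&src, &edges, /* upper_thr */ 255, /* lower_thr */ 0,
                    /* gradient_size */ 3, /* norm_type (1 = L1) */ 1,
                    BorderMode::CONSTANT, /* constant_border_value */ 0);

    NEHarrisCorners harris;
    // Same here: the flag is gone and constant_border_value keeps its default of 0.
    harris.configure(&img, /* threshold */ 0.0001f, /* min_dist */ 10.f, /* sensitivity */ 0.04f,
                     /* gradient_size */ 3, /* block_size */ 3, &corners,
                     BorderMode::UNDEFINED);
}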
-rw-r--r--  arm_compute/core/NEON/kernels/NECannyEdgeKernel.h      |   18
-rw-r--r--  arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h  |   29
-rw-r--r--  arm_compute/runtime/NEON/functions/NECannyEdge.h       |    5
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHarrisCorners.h   |    3
-rw-r--r--  docs/00_introduction.dox                               |    2
-rw-r--r--  src/core/NEON/kernels/NECannyEdgeKernel.cpp            |  738
-rw-r--r--  src/core/NEON/kernels/NEHarrisCornersKernel.cpp        |  324
-rw-r--r--  src/runtime/NEON/functions/NECannyEdge.cpp             |   18
-rw-r--r--  src/runtime/NEON/functions/NEHarrisCorners.cpp         |   75
-rw-r--r--  tests/benchmark/CL/CannyEdge.cpp                       |    3
-rw-r--r--  tests/benchmark/CL/HarrisCorners.cpp                   |   10
-rw-r--r--  tests/benchmark/NEON/CannyEdge.cpp                     |    3
-rw-r--r--  tests/benchmark/NEON/HarrisCorners.cpp                 |   34
-rw-r--r--  tests/benchmark/fixtures/CannyEdgeFixture.h            |   20
-rw-r--r--  tests/benchmark/fixtures/HarrisCornersFixture.h        |    5
-rw-r--r--  tests/validation/CL/CannyEdge.cpp                      |    9
-rw-r--r--  tests/validation/CL/HarrisCorners.cpp                  |   10
-rw-r--r--  tests/validation/NEON/CannyEdge.cpp                    |   14
-rw-r--r--  tests/validation/NEON/HarrisCorners.cpp                |   14
-rw-r--r--  tests/validation/fixtures/CannyEdgeFixture.h           |   23
-rw-r--r--  tests/validation/fixtures/HarrisCornersFixture.h       |   25
-rw-r--r--  tests/validation/reference/CannyEdgeDetector.cpp       |    6
-rw-r--r--  tests/validation/reference/CannyEdgeDetector.h         |    3
-rw-r--r--  tests/validation/reference/HarrisCornerDetector.h      |    7
24 files changed, 72 insertions(+), 1326 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
index 58ef1757fe..7924d32807 100644
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -85,24 +85,6 @@ protected:
ITensor *_phase; /**< Destination tensor - Quantized phase */
};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Gradient computation for FP16 datatype
- */
-class NEGradientFP16Kernel : public NEGradientKernel
-{
-public:
- const char *name() const override
- {
- return "NEGradientFP16Kernel";
- }
- // Inherited methods overriden:
- void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
-};
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** NEON kernel to perform Gradient computation for FP16 datatype */
-using NEGradientFP16Kernel = NEGradientKernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
*
* @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index aabf8b312b..42a899d55e 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -102,34 +102,5 @@ private:
/** Harris Score function to use for the particular image types passed to configure() */
HarrisScoreFunction *_func;
};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Interface for the accumulate Weighted kernel using F16 */
-template <int32_t block_size>
-class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel
-{
-public:
- const char *name() const override
- {
- return "NEHarrisScoreFP16Kernel";
- }
- /** Default constructor */
- NEHarrisScoreFP16Kernel();
- // Inherited methods overridden:
- void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
- BorderSize border_size() const override;
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
- float norm_factor, float sensitivity, float strength_thresh);
- /** Harris Score function to use for the particular image types passed to configure() */
- HarrisScoreFunction *_func;
-};
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Interface for the accumulate Weighted kernel using FP16 */
-template <int32_t block_size>
-using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h
index 17fefcc0ab..0b6e555667 100644
--- a/arm_compute/runtime/NEON/functions/NECannyEdge.h
+++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h
@@ -74,11 +74,8 @@ public:
* @param[in] norm_type Normalization type. If 1, L1-Norm otherwise L2-Norm
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
- *
*/
- void configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0,
- bool use_fp16 = false);
+ void configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
index 6ea14a38e5..b35a9add04 100644
--- a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
+++ b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
@@ -79,11 +79,10 @@ public:
* @param[out] corners Array of keypoints to store the results.
* @param[in] border_mode Border mode to use
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
*/
void configure(IImage *input, float threshold, float min_dist, float sensitivity,
int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
+ BorderMode border_mode, uint8_t constant_border_value = 0);
// Inherited methods overridden:
void run() override;
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index e3bbea24db..9c6df59335 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -509,7 +509,7 @@ v17.04 Public bug fixes release
- @ref NEFillArrayKernel
- @ref NEGaussianPyramidHorKernel
- @ref NEGaussianPyramidVertKernel
- - @ref NEHarrisScoreFP16Kernel
+ - NEHarrisScoreFP16Kernel
- @ref NEHarrisScoreKernel
- @ref NEHOGDetectorKernel
- @ref NELogits1DMaxKernel
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index dc37452415..fa51a7bb0b 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -51,744 +51,6 @@ constexpr int EDGE = 255;
constexpr int MAYBE = 127;
} // namespace
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // Constant use for evaluating score1 and score3
- static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
- static const float32x4_t zero = vdupq_n_f32(0.0f);
- static const float32x4_t one = vdupq_n_f32(1.0f);
- static const float32x4_t two = vdupq_n_f32(2.0f);
- static const float32x4_t three = vdupq_n_f32(3.0f);
-
- // Score0: (1, 0)
- const float32x4x2_t score0 =
- {
- vabsq_f32(gx.val[0]),
- vabsq_f32(gx.val[1])
- };
-
- // Score2: ( 0, 1 )
- const float32x4x2_t score2 =
- {
- vabsq_f32(gy.val[0]),
- vabsq_f32(gy.val[1])
- };
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float32x4x2_t score1 =
- {
- vmulq_f32(gy.val[0], const45),
- vmulq_f32(gy.val[1], const45)
- };
-
- float32x4x2_t score3 = score1;
-
- score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
- score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
- score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
- score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
-
- score1.val[0] = vabsq_f32(score1.val[0]);
- score1.val[1] = vabsq_f32(score1.val[1]);
- score3.val[0] = vabsq_f32(score3.val[0]);
- score3.val[1] = vabsq_f32(score3.val[1]);
-
- float32x4x2_t phase =
- {
- zero,
- zero
- };
-
- float32x4x2_t old_score = score0;
-
- // score1 > old_score?
- uint32x4x2_t mask =
- {
- vcgtq_f32(score1.val[0], old_score.val[0]),
- vcgtq_f32(score1.val[1], old_score.val[1])
- };
-
- phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
-
- // score2 > old_score?
- mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
-
- // score3 > old_score?
- mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
-
- // Convert from float32x4_t to uint8x8_t
- return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
- vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
-}
-
-inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
-{
- // Constant use for evaluating score1 and score3
- static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
- static const float16x8_t zero = vdupq_n_f16(0.0f);
- static const float16x8_t one = vdupq_n_f16(1.0f);
- static const float16x8_t two = vdupq_n_f16(2.0f);
- static const float16x8_t three = vdupq_n_f16(3.0f);
-
- // Score0: (1, 0)
- const float16x8_t score0 = vabsq_f16(gx);
-
- // Score2: ( 0, 1 )
- const float16x8_t score2 = vabsq_f16(gy);
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float16x8_t score1 = vmulq_f16(gy, const45);
- float16x8_t score3 = score1;
-
- score1 = vfmaq_f16(score1, gx, const45);
- score3 = vfmsq_f16(score3, gx, const45);
-
- score1 = vabsq_f16(score1);
- score3 = vabsq_f16(score3);
-
- float16x8_t phase = zero;
- float16x8_t old_score = score0;
-
- // score1 > old_score?
- uint16x8_t mask = vcgtq_f16(score1, old_score);
-
- phase = vbslq_f16(mask, one, phase);
- old_score = vbslq_f16(mask, score1, old_score);
-
- // score2 > old_score?
- mask = vcgtq_f16(score2, old_score);
-
- phase = vbslq_f16(mask, two, phase);
- old_score = vbslq_f16(mask, score2, old_score);
-
- // score3 > old_score?
- mask = vcgtq_f16(score3, old_score);
-
- phase = vbslq_f16(mask, three, phase);
-
- // Convert from float16x8_t to uint8x8_t
- return vmovn_u16(vcvtq_u16_f16(phase));
-}
-
-/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
-}
-
-/** Computes the gradient phase if gradient_size = 7. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Convert to float
- const float32x4x2_t gx_f32 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- const float32x4x2_t gy_f32 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- return phase_quantization(gx_f32, gy_f32);
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
- vreinterpretq_u16_s16(vabsq_s16(gy)));
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- const uint32x4x2_t gx_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
- };
-
- const uint32x4x2_t gy_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
- };
-
- const uint32x4x2_t out =
- {
- vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
- vaddq_u32(gx_abs.val[1], gy_abs.val[1])
- };
-
- return out;
-}
-
-inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // x^2 ...
- float32x4x2_t mag =
- {
- vmulq_f32(gx.val[0], gx.val[0]),
- vmulq_f32(gx.val[1], gx.val[1])
- };
-
- // ... + y^2
- mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
- mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
-
- // sqrt(...)
- mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
- mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
-
- return mag;
-}
-
-inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
-{
- // x^2 ...
- float16x8_t mag = vmulq_f16(gx, gx);
-
- // ... + y^2
- mag = vfmaq_f16(mag, gy, gy);
-
- // sqrt(...)
- mag = vmulq_f16(vrsqrteq_f16(mag), mag);
-
- return mag;
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- /* Compute magnitude using L2 normalization */
- const float16x8_t gx2 = vcvtq_f16_s16(gx);
- const float16x8_t gy2 = vcvtq_f16_s16(gy);
- const float16x8_t mag = mag_l2(gx2, gy2);
-
- /* Store magnitude - Convert to uint16x8 */
- return vcvtq_u16_f16(mag);
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Compute magnitude using L2 normalization
- float32x4x2_t gx2 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- float32x4x2_t gy2 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- const float32x4x2_t mag = mag_l2(gx2, gy2);
- const uint32x4x2_t mag32 =
- {
- vcvtq_u32_f32(mag.val[0]),
- vcvtq_u32_f32(mag.val[1])
- };
-
- return mag32;
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute ans store magnitude using L1 normalization
- vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute and store magnitude using L2 normalization
- vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L1 normalization
- const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L2 normalization
- const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
-{
- // Phase for 4 pixel
- const uint32x4_t pc32 = vmovl_u16(pc);
-
- // Get magnitude for 4 pixel
- uint32x4_t mc = vld1q_u32(in);
-
- // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- // 0 degree
- const uint32x4_t mk0_0 = vld1q_u32(in - 1);
- const uint32x4_t mk0_1 = vld1q_u32(in + 1);
- uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
-
- // 45 degree
- const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
- const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
- uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
-
- // 90 degree
- const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
- const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
- uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
-
- // 135 degree
- const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
- const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
- uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
-
- // Merge masks
- mask0 = vorrq_u32(mask0, mask1);
- mask2 = vorrq_u32(mask2, mask3);
- mask0 = vorrq_u32(mask0, mask2);
-
- mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
-
- // mc > upper_thr
- mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
-
- // mc <= lower_thr
- mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
-
- // mc <= upper_thr && mc > lower_thr
- mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
-
- mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
- mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
- mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
-
- return vmovn_u32(mc);
-}
-
-/** Computes edge tracing when is called by edge_trace_U8_U8 recursively
- *
- * @param[in] in Pointer to source image. Data type supported U8
- * @param[out] out Pointer to destination image. Data type supported U8
- * @param[in] in_stride Stride of the input image
- * @param[in] out_stride Stride of the output image
- */
-void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
-{
- // Look for MAYBE pixels in 8 directions
- *out = EDGE;
-
- // (-1, 0)
- uint8_t pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (+1, 0)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in -= in_stride;
- out -= out_stride;
-
- // (-1, -1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, -1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, -1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in += in_stride * 2;
- out += out_stride * 2;
-
- // (-1, +1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, +1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, +1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-}
-} // namespace fp16
-
-void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
-
- set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
- set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
-
- Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
- set_format_if_unknown(*magnitude->info(), magnitude_format);
- set_format_if_unknown(*phase->info(), Format::U8);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
- ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- if(_gx->info()->data_type() == DataType::S16)
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
- }
- }
- else
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
- }
- }
-
- constexpr unsigned int num_elems_processed_per_iteration = 32;
-
- // Configure kernel window
- Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
- mag_access.set_valid_region(win, _gx->info()->valid_region());
- phase_access.set_valid_region(win, _gx->info()->valid_region());
-
- INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
namespace
{
inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 5e1c216b65..61221c1070 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -39,330 +39,6 @@
using namespace arm_compute;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace fp16
-{
-inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.f);
-
- // Trace^2
- float16x8_t trace2 = vaddq_f16(gx2, gy2);
- trace2 = vmulq_f16(trace2, trace2);
-
- // Det(A)
- float16x8_t det = vmulq_f16(gx2, gy2);
- det = vfmsq_f16(det, gxgy, gxgy);
-
- // Det(A) - sensitivity * trace^2
- const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
-
- // mc > strength_thresh
- const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
-
- return vbslq_f16(mask, mc, zero);
-}
-
-template <size_t block_size>
-inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
- float norm_factor)
-{
- const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
-
- // Normalize
- low_gx = vmulq_f16(low_gx, norm_factor_fp16);
- low_gy = vmulq_f16(low_gy, norm_factor_fp16);
- high_gx = vmulq_f16(high_gx, norm_factor_fp16);
- high_gy = vmulq_f16(high_gy, norm_factor_fp16);
-
- float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
- float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 1);
- gy = vextq_f16(low_gy, high_gy, 1);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 2);
- gy = vextq_f16(low_gy, high_gy, 2);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- if(block_size > 3)
- {
- gx = vextq_f16(low_gx, high_gx, 3);
- gy = vextq_f16(low_gy, high_gy, 3);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 4);
- gy = vextq_f16(low_gy, high_gy, 4);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-
- if(block_size == 7)
- {
- gx = vextq_f16(low_gx, high_gx, 5);
- gy = vextq_f16(low_gy, high_gy, 5);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 6);
- gy = vextq_f16(low_gy, high_gy, 6);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-}
-
-template <size_t block_size>
-inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- auto gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
- const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = vdupq_n_f16(0.0f);
- float16x8_t gy2 = vdupq_n_f16(0.0f);
- float16x8_t gxgy = vdupq_n_f16(0.0f);
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
- const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
- const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
- const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <size_t block_size>
-inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vget_low_f16(zero));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vget_low_f16(zero));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <>
-inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
- const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < 7; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
- harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-} // namespace fp16
-
-template <int32_t block_size>
-BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
-{
- return _border_size;
-}
-
-template <int32_t block_size>
-NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
- : INEHarrisScoreKernel(), _func(nullptr)
-{
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- Iterator input1(_input1, window);
- Iterator input2(_input2, window);
- Iterator output(_output, window);
-
- const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
- },
- input1, input2, output);
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
- _sensitivity = sensitivity;
- _strength_thresh = strength_thresh;
- _norm_factor = norm_factor;
- _border_size = BorderSize(block_size / 2);
-
- if(input1->info()->data_type() == DataType::S16)
- {
- _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
- }
- else
- {
- _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
- }
-
- ARM_COMPUTE_ERROR_ON(nullptr == _func);
-
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = block_size;
-
- // Configure kernel window
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
- INEKernel::configure(win);
-}
-
-template class arm_compute::NEHarrisScoreFP16Kernel<3>;
-template class arm_compute::NEHarrisScoreFP16Kernel<5>;
-template class arm_compute::NEHarrisScoreFP16Kernel<7>;
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
template class arm_compute::NEHarrisScoreKernel<7>;
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index d72c98bb50..0e5d50fd21 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -58,8 +58,7 @@ NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLI
{
}
-void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
- bool use_fp16)
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -127,18 +126,9 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
_memory_group.manage(&_phase);
// Configure gradient
- if(use_fp16)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
- else
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
// Allocate intermediate tensors
_gx.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 25e28d2213..db5e9265bb 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@ NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager)
void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+ BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -126,62 +126,31 @@ void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
// Manage intermediate buffers
_memory_group.manage(&_score);
- if(use_fp16)
+ // Set/init Harris Score kernel accordingly with block_size
+ switch(block_size)
{
- switch(block_size)
+ case 3:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
- }
- else
- {
- // Set/init Harris Score kernel accordingly with block_size
- switch(block_size)
+ break;
+ case 5:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
+ break;
+ case 7:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ default:
+ break;
}
// Configure border filling before harris score
diff --git a/tests/benchmark/CL/CannyEdge.cpp b/tests/benchmark/CL/CannyEdge.cpp
index 7e327f3df3..8cc918f4c8 100644
--- a/tests/benchmark/CL/CannyEdge.cpp
+++ b/tests/benchmark/CL/CannyEdge.cpp
@@ -42,10 +42,9 @@ namespace
{
// *INDENT-OFF*
// clang-format off
-const auto use_fp16 = framework::dataset::make("UseFP16", { false });
const auto canny_edge_dataset = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }),
- combine(datasets::BorderModes(), use_fp16)));
+ datasets::BorderModes()));
} // namespace
using CLCannyEdgeFixture = CannyEdgeFixture<CLTensor, CLCannyEdge, CLAccessor>;
diff --git a/tests/benchmark/CL/HarrisCorners.cpp b/tests/benchmark/CL/HarrisCorners.cpp
index 990ac24894..dbaf95c98d 100644
--- a/tests/benchmark/CL/HarrisCorners.cpp
+++ b/tests/benchmark/CL/HarrisCorners.cpp
@@ -54,25 +54,23 @@ using CLHarrisCornersFixture = HarrisCornersFixture<CLTensor, CLHarrisCorners, C
TEST_SUITE(CL)
TEST_SUITE(HarrisCorners)
-REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, CLHarrisCornersFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallImageFiles(),
+REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, CLHarrisCornersFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallImageFiles(),
framework::dataset::make("Format", { Format::U8 })),
threshold),
min_dist),
sensitivity),
gradient_size),
block_size),
- border_mode),
- framework::dataset::make("UseFP16", { false })));
+ border_mode));
-REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, CLHarrisCornersFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeImageFiles(),
+REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, CLHarrisCornersFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeImageFiles(),
framework::dataset::make("Format", { Format::U8 })),
threshold),
min_dist),
sensitivity),
gradient_size),
block_size),
- border_mode),
- framework::dataset::make("UseFP16", { false })));
+ border_mode));
TEST_SUITE_END() // HarrisCorners
TEST_SUITE_END() // CL
diff --git a/tests/benchmark/NEON/CannyEdge.cpp b/tests/benchmark/NEON/CannyEdge.cpp
index 50fd832e5d..c4414507c9 100644
--- a/tests/benchmark/NEON/CannyEdge.cpp
+++ b/tests/benchmark/NEON/CannyEdge.cpp
@@ -42,10 +42,9 @@ namespace
{
// *INDENT-OFF*
// clang-format off
-const auto use_fp16 = framework::dataset::make("UseFP16", { false });
const auto canny_edge_dataset = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }),
- combine(datasets::BorderModes(), use_fp16)));
+ datasets::BorderModes()));
} // namespace
using NECannyEdgeFixture = CannyEdgeFixture<Tensor, NECannyEdge, Accessor>;
diff --git a/tests/benchmark/NEON/HarrisCorners.cpp b/tests/benchmark/NEON/HarrisCorners.cpp
index 33315ffcd0..75a6794cc5 100644
--- a/tests/benchmark/NEON/HarrisCorners.cpp
+++ b/tests/benchmark/NEON/HarrisCorners.cpp
@@ -53,49 +53,23 @@ using NEHarrisCornersFixture = HarrisCornersFixture<Tensor, NEHarrisCorners, Acc
TEST_SUITE(NEON)
TEST_SUITE(HarrisCorners)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(FP16)
-
-REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, NEHarrisCornersFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallImageFiles(),
- framework::dataset::make("Format", { Format::U8 })),
- threshold),
- min_dist),
- sensitivity),
- gradient_size),
- block_size),
- border_mode),
- framework::dataset::make("UseFP16", { true })));
-REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, NEHarrisCornersFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeImageFiles(),
- framework::dataset::make("Format", { Format::U8 })),
- threshold),
- min_dist),
- sensitivity),
- gradient_size),
- block_size),
- border_mode),
- framework::dataset::make("UseFP16", { true })));
-TEST_SUITE_END() // FP16
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
TEST_SUITE(S16)
-REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, NEHarrisCornersFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallImageFiles(),
+REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, NEHarrisCornersFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallImageFiles(),
framework::dataset::make("Format", { Format::U8 })),
threshold),
min_dist),
sensitivity),
gradient_size),
block_size),
- border_mode),
- framework::dataset::make("UseFP16", { false })));
-REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, NEHarrisCornersFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeImageFiles(),
+ border_mode));
+REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, NEHarrisCornersFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeImageFiles(),
framework::dataset::make("Format", { Format::U8 })),
threshold),
min_dist),
sensitivity),
gradient_size),
block_size),
- border_mode),
- framework::dataset::make("UseFP16", { false })));
+ border_mode));
TEST_SUITE_END() // S16
TEST_SUITE_END() // HarrisCorners
TEST_SUITE_END() // NEON
diff --git a/tests/benchmark/fixtures/CannyEdgeFixture.h b/tests/benchmark/fixtures/CannyEdgeFixture.h
index 099dabaf18..e2aec24b3e 100644
--- a/tests/benchmark/fixtures/CannyEdgeFixture.h
+++ b/tests/benchmark/fixtures/CannyEdgeFixture.h
@@ -44,7 +44,7 @@ class CannyEdgeFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, bool use_fp16, Format format)
+ void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format)
{
// Load the image (cached by the library if loaded before)
const RawTensor &raw = library->get(image, format);
@@ -52,7 +52,7 @@ public:
src = create_tensor<TensorType>(raw.shape(), format);
dst = create_tensor<TensorType>(raw.shape(), format);
- configure_target<Function>(canny_edge_func, src, dst, gradient_size, static_cast<int>(norm_type) + 1, border_mode, use_fp16);
+ canny_edge_func.configure(&src, &dst, upper_thresh, lower_thresh, gradient_size, static_cast<int>(norm_type) + 1, border_mode, constant_border_value);
// Allocate tensors
src.allocator()->allocate();
@@ -72,22 +72,6 @@ public:
sync_tensor_if_necessary<TensorType>(dst);
}
-protected:
- template <typename F, typename std::enable_if<std::is_same<F, NECannyEdge>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16)
- {
- func.configure(&src, &dst, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value, use_fp16);
- }
-
- template <typename F, typename std::enable_if<std::is_same<F, CLCannyEdge>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16)
- {
- ARM_COMPUTE_UNUSED(use_fp16);
- ARM_COMPUTE_ERROR_ON(use_fp16);
-
- func.configure(&src, &dst, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
- }
-
private:
static const int32_t lower_thresh = 0;
static const int32_t upper_thresh = 255;
diff --git a/tests/benchmark/fixtures/HarrisCornersFixture.h b/tests/benchmark/fixtures/HarrisCornersFixture.h
index c7ce683ab5..29d1741156 100644
--- a/tests/benchmark/fixtures/HarrisCornersFixture.h
+++ b/tests/benchmark/fixtures/HarrisCornersFixture.h
@@ -42,8 +42,7 @@ class HarrisCornersFixture : public framework::Fixture
public:
template <typename...>
void setup(std::string image, Format format, float threshold, float min_dist, float sensitivity,
- int32_t gradient_size, int32_t block_size,
- BorderMode border_mode, bool use_fp16)
+ int32_t gradient_size, int32_t block_size, BorderMode border_mode)
{
// Load the image (cached by the library if loaded before)
const RawTensor &raw = library->get(image, format);
@@ -52,7 +51,7 @@ public:
src = create_tensor<TensorType>(raw.shape(), format);
// Create and configure function
- harris_corners_func.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &out, border_mode, 0, use_fp16);
+ harris_corners_func.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &out, border_mode, 0);
// Allocate tensor
src.allocator()->allocate();
diff --git a/tests/validation/CL/CannyEdge.cpp b/tests/validation/CL/CannyEdge.cpp
index 7aa178adba..d130aa4bd1 100644
--- a/tests/validation/CL/CannyEdge.cpp
+++ b/tests/validation/CL/CannyEdge.cpp
@@ -48,21 +48,16 @@ namespace
/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
const float allowed_mismatch_ratio = 0.1f;
-const auto use_fp16 = framework::dataset::make("UseFP16", { false });
-
const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
- combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), combine(datasets::BorderModes(), use_fp16)));
+ combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
} // namespace
TEST_SUITE(CL)
TEST_SUITE(CannyEdge)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)),
- shape, gradient_size, normalization, border_mode, use_fp16, format)
+ shape, gradient_size, normalization, border_mode, format)
{
- ARM_COMPUTE_UNUSED(use_fp16);
- ARM_COMPUTE_ERROR_ON(use_fp16);
-
CannyEdgeParameters params = canny_edge_parameters();
// Convert normalisation type to integer
const auto norm_type = static_cast<int>(normalization) + 1;
diff --git a/tests/validation/CL/HarrisCorners.cpp b/tests/validation/CL/HarrisCorners.cpp
index 890367c166..ccc9293fef 100644
--- a/tests/validation/CL/HarrisCorners.cpp
+++ b/tests/validation/CL/HarrisCorners.cpp
@@ -46,21 +46,15 @@ namespace validation
{
namespace
{
-const auto use_fp16 = framework::dataset::make("UseFP16",
-{ false });
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), combine(datasets::BorderModes(), use_fp16)));
+const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), datasets::BorderModes()));
} // namespace
TEST_SUITE(CL)
TEST_SUITE(HarrisCorners)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)), shape,
- gradient_size, block_size, border_mode, use_fp16, format)
+ gradient_size, block_size, border_mode, format)
{
- ARM_COMPUTE_UNUSED(use_fp16);
- ARM_COMPUTE_ERROR_ON(use_fp16);
-
std::mt19937 gen(library->seed());
std::uniform_real_distribution<float> real_dist(0.f, 0.01f);
diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp
index 5697b622f2..7c4cd8090f 100644
--- a/tests/validation/NEON/CannyEdge.cpp
+++ b/tests/validation/NEON/CannyEdge.cpp
@@ -48,23 +48,15 @@ namespace
/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
const float allowed_mismatch_ratio = 0.1f;
-const auto use_fp16 = framework::dataset::make("UseFP16",
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- true,
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- false
-});
-
const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
- combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), combine(datasets::BorderModes(), use_fp16)));
+ combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
} // namespace
TEST_SUITE(NEON)
TEST_SUITE(CannyEdge)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)),
- shape, gradient_size, normalization, border_mode, use_fp16, format)
+ shape, gradient_size, normalization, border_mode, format)
{
CannyEdgeParameters params = canny_edge_parameters();
// Convert normalisation type to integer
@@ -81,7 +73,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(conca
// Create Canny edge configure function
NECannyEdge canny_edge;
- canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value, use_fp16);
+ canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
// Validate valid region
validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
diff --git a/tests/validation/NEON/HarrisCorners.cpp b/tests/validation/NEON/HarrisCorners.cpp
index 3474a96f8f..e0bbd92a2e 100644
--- a/tests/validation/NEON/HarrisCorners.cpp
+++ b/tests/validation/NEON/HarrisCorners.cpp
@@ -50,22 +50,14 @@ const float allowed_missing_percentage = 10.f;
/* Allowed percentage of keypoints mismatching between target and reference */
const float allowed_mismatch_percentage = 10.f;
-const auto use_fp16 = framework::dataset::make("UseFP16",
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- true,
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- false
-});
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), combine(datasets::BorderModes(), use_fp16)));
+const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), datasets::BorderModes()));
} // namespace
TEST_SUITE(NEON)
TEST_SUITE(HarrisCorners)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)), shape,
- gradient_size, block_size, border_mode, use_fp16, format)
+ gradient_size, block_size, border_mode, format)
{
std::mt19937 gen(library->seed());
std::uniform_real_distribution<float> real_dist(0.f, 0.01f);
@@ -90,7 +82,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(conca
// Create harris corners configure function
NEHarrisCorners harris_corners;
- harris_corners.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &corners, border_mode, constant_border_value, use_fp16);
+ harris_corners.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &corners, border_mode, constant_border_value);
// Validate padding
PaddingCalculator calculator(shape.x(), 8);
diff --git a/tests/validation/fixtures/CannyEdgeFixture.h b/tests/validation/fixtures/CannyEdgeFixture.h
index 0f37c46641..d52b17e550 100644
--- a/tests/validation/fixtures/CannyEdgeFixture.h
+++ b/tests/validation/fixtures/CannyEdgeFixture.h
@@ -47,12 +47,11 @@ class CannyEdgeValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, bool use_fp16, Format format)
+ void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format)
{
CannyEdgeParameters params = canny_edge_parameters();
- _target = compute_target(image, gradient_size, norm_type, border_mode, use_fp16, format, params);
- //TODO(COMPMID-543): Add use_fp16 to reference
+ _target = compute_target(image, gradient_size, norm_type, border_mode, format, params);
_reference = compute_reference(image, gradient_size, norm_type, border_mode, format, params);
}
@@ -63,21 +62,7 @@ protected:
library->fill(tensor, raw);
}
- template <typename F, typename std::enable_if<std::is_same<F, NECannyEdge>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16, const CannyEdgeParameters &params)
- {
- func.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value, use_fp16);
- }
-
- template <typename F, typename std::enable_if<std::is_same<F, CLCannyEdge>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16, const CannyEdgeParameters &params)
- {
- ARM_COMPUTE_UNUSED(use_fp16);
- ARM_COMPUTE_ERROR_ON(use_fp16);
- func.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
- }
-
- TensorType compute_target(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, bool use_fp16, Format format, const CannyEdgeParameters &params)
+ TensorType compute_target(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format, const CannyEdgeParameters &params)
{
// Load the image (cached by the library if loaded before)
const RawTensor &raw = library->get(image, format);
@@ -90,7 +75,7 @@ protected:
// Create Canny edge configure function
FunctionType canny_edge;
- configure_target<FunctionType>(canny_edge, src, dst, gradient_size, static_cast<int>(norm_type) + 1, border_mode, use_fp16, params);
+ canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, static_cast<int>(norm_type) + 1, border_mode, params.constant_border_value);
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
diff --git a/tests/validation/fixtures/HarrisCornersFixture.h b/tests/validation/fixtures/HarrisCornersFixture.h
index 1c30157344..f1d1f2d135 100644
--- a/tests/validation/fixtures/HarrisCornersFixture.h
+++ b/tests/validation/fixtures/HarrisCornersFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,12 +47,11 @@ class HarrisCornersValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(std::string image, int gradient_size, int block_size, BorderMode border_mode, bool use_fp16, Format format)
+ void setup(std::string image, int gradient_size, int block_size, BorderMode border_mode, Format format)
{
HarrisCornersParameters params = harris_corners_parameters();
- _target = compute_target(image, gradient_size, block_size, border_mode, use_fp16, format, params);
- //TODO(COMPMID-543): Add use_fp16 to reference
+ _target = compute_target(image, gradient_size, block_size, border_mode, format, params);
_reference = compute_reference(image, gradient_size, block_size, border_mode, format, params);
}
@@ -63,21 +62,7 @@ protected:
library->fill(tensor, raw);
}
- template <typename F, typename std::enable_if<std::is_same<F, NEHarrisCorners>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, ArrayType &corners, int gradient_size, int block_size, BorderMode border_mode, bool use_fp16, const HarrisCornersParameters &params)
- {
- func.configure(&src, params.threshold, params.min_dist, params.sensitivity, gradient_size, block_size, &corners, border_mode, params.constant_border_value, use_fp16);
- }
-
- template <typename F, typename std::enable_if<std::is_same<F, CLHarrisCorners>::value, int>::type = 0>
- void configure_target(F &func, TensorType &src, ArrayType &corners, int gradient_size, int block_size, BorderMode border_mode, bool use_fp16, const HarrisCornersParameters &params)
- {
- ARM_COMPUTE_UNUSED(use_fp16);
- ARM_COMPUTE_ERROR_ON(use_fp16);
- func.configure(&src, params.threshold, params.min_dist, params.sensitivity, gradient_size, block_size, &corners, border_mode, params.constant_border_value);
- }
-
- ArrayType compute_target(std::string image, int gradient_size, int block_size, BorderMode border_mode, bool use_fp16, Format format, const HarrisCornersParameters &params)
+ ArrayType compute_target(std::string image, int gradient_size, int block_size, BorderMode border_mode, Format format, const HarrisCornersParameters &params)
{
// Load the image (cached by the library if loaded before)
const RawTensor &raw = library->get(image, format);
@@ -90,7 +75,7 @@ protected:
// Create harris corners configure function
FunctionType harris_corners;
- configure_target<FunctionType>(harris_corners, src, corners, gradient_size, block_size, border_mode, use_fp16, params);
+ harris_corners.configure(&src, params.threshold, params.min_dist, params.sensitivity, gradient_size, block_size, &corners, border_mode, params.constant_border_value);
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
index cfe8ae8100..d50452bfe8 100644
--- a/tests/validation/reference/CannyEdgeDetector.cpp
+++ b/tests/validation/reference/CannyEdgeDetector.cpp
@@ -231,7 +231,8 @@ SimpleTensor<T> canny_edge_detector_impl(const SimpleTensor<T> &src, int32_t upp
} // namespace
template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
+ int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
BorderMode border_mode, T constant_border_value)
{
if(gradient_size < 7)
@@ -244,7 +245,8 @@ SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_th
}
}
-template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src,
+ int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
BorderMode border_mode, uint8_t constant_border_value);
} // namespace reference
} // namespace validation
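The reference implementation keeps the same parameter set as the runtime function (minus use_fp16), but takes the MagnitudeType enum directly rather than the +1 integer convention used on the runtime side. A sketch of a direct call, assuming it is compiled inside the tests/ tree and uses the uint8_t instantiation shown above; the parameter values are illustrative:

    #include "arm_compute/core/Types.h"
    #include "tests/SimpleTensor.h"
    #include "tests/validation/reference/CannyEdgeDetector.h"

    using namespace arm_compute;
    using namespace arm_compute::test;
    using namespace arm_compute::test::validation;

    SimpleTensor<uint8_t> run_reference_canny(const SimpleTensor<uint8_t> &src)
    {
        // constant_border_value defaults to 0, so it can be omitted here.
        return reference::canny_edge_detector(src, 255 /* upper_thresh */, 0 /* lower_thresh */,
                                              3 /* gradient_size */, MagnitudeType::L2NORM,
                                              BorderMode::UNDEFINED);
    }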
diff --git a/tests/validation/reference/CannyEdgeDetector.h b/tests/validation/reference/CannyEdgeDetector.h
index a46c145153..ee6199d96e 100644
--- a/tests/validation/reference/CannyEdgeDetector.h
+++ b/tests/validation/reference/CannyEdgeDetector.h
@@ -36,7 +36,8 @@ namespace validation
namespace reference
{
template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
+ int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
BorderMode border_mode, T constant_border_value = 0);
} // namespace reference
} // namespace validation
diff --git a/tests/validation/reference/HarrisCornerDetector.h b/tests/validation/reference/HarrisCornerDetector.h
index 042e8570c2..f208eaa743 100644
--- a/tests/validation/reference/HarrisCornerDetector.h
+++ b/tests/validation/reference/HarrisCornerDetector.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,9 @@ namespace validation
namespace reference
{
template <typename T>
-std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src, float threshold, float min_dist, float sensitivity, int gradient_size, int block_size, BorderMode border_mode,
- T constant_border_value = 0);
+std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src,
+ float threshold, float min_dist, float sensitivity, int gradient_size, int block_size,
+ BorderMode border_mode, T constant_border_value = 0);
} // namespace reference
} // namespace validation
} // namespace test
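The Harris reference follows the same pattern: the NEHarrisCorners parameters minus use_fp16, returning the detected keypoints as a std::vector<KeyPoint> rather than filling an arm_compute Array. A sketch under the same assumptions as the Canny example (illustrative values, uint8_t instantiation, built inside the tests/ tree):

    #include "arm_compute/core/Types.h"
    #include "tests/SimpleTensor.h"
    #include "tests/validation/reference/HarrisCornerDetector.h"

    #include <vector>

    using namespace arm_compute;
    using namespace arm_compute::test;
    using namespace arm_compute::test::validation;

    std::vector<KeyPoint> run_reference_harris(const SimpleTensor<uint8_t> &src)
    {
        return reference::harris_corner_detector(src, 0.0001f /* threshold */, 3.f /* min_dist */,
                                                 0.04f /* sensitivity */, 3 /* gradient_size */,
                                                 3 /* block_size */, BorderMode::UNDEFINED);
    }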