aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-08-30 16:02:11 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commit09d3451b4caf8d5e0e7cf2c6097e50a1f815d027 (patch)
tree8e9b8e9882d0120c85ff08473dbf3721b7602677 /src
parentb57be0da77370e5e71fe82dfa281f528279e8127 (diff)
downloadComputeLibrary-09d3451b4caf8d5e0e7cf2c6097e50a1f815d027.tar.gz
COMPMID-1548: NEON FP16 mismatches on CannyEdge and HarrisCorners.
Removes FP16 from HarrisCorners and CannyEdge. Change-Id: I5e4f9205fdbe4de85f04f55ecf1568c837e56cc0 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/146247 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/NEON/kernels/NECannyEdgeKernel.cpp738
-rw-r--r--src/core/NEON/kernels/NEHarrisCornersKernel.cpp324
-rw-r--r--src/runtime/NEON/functions/NECannyEdge.cpp18
-rw-r--r--src/runtime/NEON/functions/NEHarrisCorners.cpp75
4 files changed, 26 insertions, 1129 deletions
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index dc37452415..fa51a7bb0b 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -51,744 +51,6 @@ constexpr int EDGE = 255;
constexpr int MAYBE = 127;
} // namespace
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // Constant use for evaluating score1 and score3
- static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
- static const float32x4_t zero = vdupq_n_f32(0.0f);
- static const float32x4_t one = vdupq_n_f32(1.0f);
- static const float32x4_t two = vdupq_n_f32(2.0f);
- static const float32x4_t three = vdupq_n_f32(3.0f);
-
- // Score0: (1, 0)
- const float32x4x2_t score0 =
- {
- vabsq_f32(gx.val[0]),
- vabsq_f32(gx.val[1])
- };
-
- // Score2: ( 0, 1 )
- const float32x4x2_t score2 =
- {
- vabsq_f32(gy.val[0]),
- vabsq_f32(gy.val[1])
- };
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float32x4x2_t score1 =
- {
- vmulq_f32(gy.val[0], const45),
- vmulq_f32(gy.val[1], const45)
- };
-
- float32x4x2_t score3 = score1;
-
- score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
- score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
- score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
- score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
-
- score1.val[0] = vabsq_f32(score1.val[0]);
- score1.val[1] = vabsq_f32(score1.val[1]);
- score3.val[0] = vabsq_f32(score3.val[0]);
- score3.val[1] = vabsq_f32(score3.val[1]);
-
- float32x4x2_t phase =
- {
- zero,
- zero
- };
-
- float32x4x2_t old_score = score0;
-
- // score1 > old_score?
- uint32x4x2_t mask =
- {
- vcgtq_f32(score1.val[0], old_score.val[0]),
- vcgtq_f32(score1.val[1], old_score.val[1])
- };
-
- phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
-
- // score2 > old_score?
- mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
-
- // score3 > old_score?
- mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
-
- // Convert from float32x4_t to uint8x8_t
- return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
- vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
-}
-
-inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
-{
- // Constant use for evaluating score1 and score3
- static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
- static const float16x8_t zero = vdupq_n_f16(0.0f);
- static const float16x8_t one = vdupq_n_f16(1.0f);
- static const float16x8_t two = vdupq_n_f16(2.0f);
- static const float16x8_t three = vdupq_n_f16(3.0f);
-
- // Score0: (1, 0)
- const float16x8_t score0 = vabsq_f16(gx);
-
- // Score2: ( 0, 1 )
- const float16x8_t score2 = vabsq_f16(gy);
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float16x8_t score1 = vmulq_f16(gy, const45);
- float16x8_t score3 = score1;
-
- score1 = vfmaq_f16(score1, gx, const45);
- score3 = vfmsq_f16(score3, gx, const45);
-
- score1 = vabsq_f16(score1);
- score3 = vabsq_f16(score3);
-
- float16x8_t phase = zero;
- float16x8_t old_score = score0;
-
- // score1 > old_score?
- uint16x8_t mask = vcgtq_f16(score1, old_score);
-
- phase = vbslq_f16(mask, one, phase);
- old_score = vbslq_f16(mask, score1, old_score);
-
- // score2 > old_score?
- mask = vcgtq_f16(score2, old_score);
-
- phase = vbslq_f16(mask, two, phase);
- old_score = vbslq_f16(mask, score2, old_score);
-
- // score3 > old_score?
- mask = vcgtq_f16(score3, old_score);
-
- phase = vbslq_f16(mask, three, phase);
-
- // Convert from float16x8_t to uint8x8_t
- return vmovn_u16(vcvtq_u16_f16(phase));
-}
-
-/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
-}
-
-/** Computes the gradient phase if gradient_size = 7. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Convert to float
- const float32x4x2_t gx_f32 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- const float32x4x2_t gy_f32 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- return phase_quantization(gx_f32, gy_f32);
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
- vreinterpretq_u16_s16(vabsq_s16(gy)));
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- const uint32x4x2_t gx_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
- };
-
- const uint32x4x2_t gy_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
- };
-
- const uint32x4x2_t out =
- {
- vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
- vaddq_u32(gx_abs.val[1], gy_abs.val[1])
- };
-
- return out;
-}
-
-inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // x^2 ...
- float32x4x2_t mag =
- {
- vmulq_f32(gx.val[0], gx.val[0]),
- vmulq_f32(gx.val[1], gx.val[1])
- };
-
- // ... + y^2
- mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
- mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
-
- // sqrt(...)
- mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
- mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
-
- return mag;
-}
-
-inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
-{
- // x^2 ...
- float16x8_t mag = vmulq_f16(gx, gx);
-
- // ... + y^2
- mag = vfmaq_f16(mag, gy, gy);
-
- // sqrt(...)
- mag = vmulq_f16(vrsqrteq_f16(mag), mag);
-
- return mag;
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- /* Compute magnitude using L2 normalization */
- const float16x8_t gx2 = vcvtq_f16_s16(gx);
- const float16x8_t gy2 = vcvtq_f16_s16(gy);
- const float16x8_t mag = mag_l2(gx2, gy2);
-
- /* Store magnitude - Convert to uint16x8 */
- return vcvtq_u16_f16(mag);
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Compute magnitude using L2 normalization
- float32x4x2_t gx2 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- float32x4x2_t gy2 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- const float32x4x2_t mag = mag_l2(gx2, gy2);
- const uint32x4x2_t mag32 =
- {
- vcvtq_u32_f32(mag.val[0]),
- vcvtq_u32_f32(mag.val[1])
- };
-
- return mag32;
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute ans store magnitude using L1 normalization
- vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute and store magnitude using L2 normalization
- vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L1 normalization
- const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L2 normalization
- const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
-{
- // Phase for 4 pixel
- const uint32x4_t pc32 = vmovl_u16(pc);
-
- // Get magnitude for 4 pixel
- uint32x4_t mc = vld1q_u32(in);
-
- // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- // 0 degree
- const uint32x4_t mk0_0 = vld1q_u32(in - 1);
- const uint32x4_t mk0_1 = vld1q_u32(in + 1);
- uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
-
- // 45 degree
- const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
- const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
- uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
-
- // 90 degree
- const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
- const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
- uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
-
- // 135 degree
- const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
- const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
- uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
-
- // Merge masks
- mask0 = vorrq_u32(mask0, mask1);
- mask2 = vorrq_u32(mask2, mask3);
- mask0 = vorrq_u32(mask0, mask2);
-
- mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
-
- // mc > upper_thr
- mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
-
- // mc <= lower_thr
- mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
-
- // mc <= upper_thr && mc > lower_thr
- mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
-
- mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
- mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
- mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
-
- return vmovn_u32(mc);
-}
-
-/** Computes edge tracing when is called by edge_trace_U8_U8 recursively
- *
- * @param[in] in Pointer to source image. Data type supported U8
- * @param[out] out Pointer to destination image. Data type supported U8
- * @param[in] in_stride Stride of the input image
- * @param[in] out_stride Stride of the output image
- */
-void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
-{
- // Look for MAYBE pixels in 8 directions
- *out = EDGE;
-
- // (-1, 0)
- uint8_t pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (+1, 0)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in -= in_stride;
- out -= out_stride;
-
- // (-1, -1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, -1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, -1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in += in_stride * 2;
- out += out_stride * 2;
-
- // (-1, +1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, +1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, +1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-}
-} // namespace fp16
-
-void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
-
- set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
- set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
-
- Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
- set_format_if_unknown(*magnitude->info(), magnitude_format);
- set_format_if_unknown(*phase->info(), Format::U8);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
- ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- if(_gx->info()->data_type() == DataType::S16)
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
- }
- }
- else
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
- }
- }
-
- constexpr unsigned int num_elems_processed_per_iteration = 32;
-
- // Configure kernel window
- Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
- mag_access.set_valid_region(win, _gx->info()->valid_region());
- phase_access.set_valid_region(win, _gx->info()->valid_region());
-
- INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
namespace
{
inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 5e1c216b65..61221c1070 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -39,330 +39,6 @@
using namespace arm_compute;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace fp16
-{
-inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.f);
-
- // Trace^2
- float16x8_t trace2 = vaddq_f16(gx2, gy2);
- trace2 = vmulq_f16(trace2, trace2);
-
- // Det(A)
- float16x8_t det = vmulq_f16(gx2, gy2);
- det = vfmsq_f16(det, gxgy, gxgy);
-
- // Det(A) - sensitivity * trace^2
- const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
-
- // mc > strength_thresh
- const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
-
- return vbslq_f16(mask, mc, zero);
-}
-
-template <size_t block_size>
-inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
- float norm_factor)
-{
- const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
-
- // Normalize
- low_gx = vmulq_f16(low_gx, norm_factor_fp16);
- low_gy = vmulq_f16(low_gy, norm_factor_fp16);
- high_gx = vmulq_f16(high_gx, norm_factor_fp16);
- high_gy = vmulq_f16(high_gy, norm_factor_fp16);
-
- float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
- float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 1);
- gy = vextq_f16(low_gy, high_gy, 1);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 2);
- gy = vextq_f16(low_gy, high_gy, 2);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- if(block_size > 3)
- {
- gx = vextq_f16(low_gx, high_gx, 3);
- gy = vextq_f16(low_gy, high_gy, 3);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 4);
- gy = vextq_f16(low_gy, high_gy, 4);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-
- if(block_size == 7)
- {
- gx = vextq_f16(low_gx, high_gx, 5);
- gy = vextq_f16(low_gy, high_gy, 5);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 6);
- gy = vextq_f16(low_gy, high_gy, 6);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-}
-
-template <size_t block_size>
-inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- auto gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
- const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = vdupq_n_f16(0.0f);
- float16x8_t gy2 = vdupq_n_f16(0.0f);
- float16x8_t gxgy = vdupq_n_f16(0.0f);
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
- const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
- const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
- const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <size_t block_size>
-inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vget_low_f16(zero));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vget_low_f16(zero));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <>
-inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
- const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < 7; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
- harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-} // namespace fp16
-
-template <int32_t block_size>
-BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
-{
- return _border_size;
-}
-
-template <int32_t block_size>
-NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
- : INEHarrisScoreKernel(), _func(nullptr)
-{
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- Iterator input1(_input1, window);
- Iterator input2(_input2, window);
- Iterator output(_output, window);
-
- const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
- },
- input1, input2, output);
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
- _sensitivity = sensitivity;
- _strength_thresh = strength_thresh;
- _norm_factor = norm_factor;
- _border_size = BorderSize(block_size / 2);
-
- if(input1->info()->data_type() == DataType::S16)
- {
- _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
- }
- else
- {
- _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
- }
-
- ARM_COMPUTE_ERROR_ON(nullptr == _func);
-
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = block_size;
-
- // Configure kernel window
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
- INEKernel::configure(win);
-}
-
-template class arm_compute::NEHarrisScoreFP16Kernel<3>;
-template class arm_compute::NEHarrisScoreFP16Kernel<5>;
-template class arm_compute::NEHarrisScoreFP16Kernel<7>;
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
template class arm_compute::NEHarrisScoreKernel<7>;
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index d72c98bb50..0e5d50fd21 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -58,8 +58,7 @@ NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLI
{
}
-void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
- bool use_fp16)
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -127,18 +126,9 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
_memory_group.manage(&_phase);
// Configure gradient
- if(use_fp16)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
- else
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
// Allocate intermediate tensors
_gx.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 25e28d2213..db5e9265bb 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@ NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager)
void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+ BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -126,62 +126,31 @@ void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
// Manage intermediate buffers
_memory_group.manage(&_score);
- if(use_fp16)
+ // Set/init Harris Score kernel accordingly with block_size
+ switch(block_size)
{
- switch(block_size)
+ case 3:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
- }
- else
- {
- // Set/init Harris Score kernel accordingly with block_size
- switch(block_size)
+ break;
+ case 5:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
+ break;
+ case 7:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ default:
+ break;
}
// Configure border filling before harris score