aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-08-23 17:26:21 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commitfbf3ecc1833b860019f965c88cda87ec9f44c3d5 (patch)
treea839e4c12b27346731c1dfd9a58e6eec343e5569 /src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
parent1fa17c2edc51e53c6bd388b332d825208f6562e8 (diff)
downloadComputeLibrary-fbf3ecc1833b860019f965c88cda87ec9f44c3d5.tar.gz
COMPMID-1534 - Fix GEMM and Magnitude test for FP16
On GEMM we had accuracy issue On Magnitude we have disabled the fp16 acceleration since we do not have feature parity with CL and this function is not used for ML Change-Id: Iaebe3bbbd2a9f45db0c714aa5ebaf48eb0b65741 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145467 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp382
1 files changed, 1 insertions, 381 deletions
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 2d7c29d9a0..4a318f02c1 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,386 +51,6 @@ constexpr float COEFF1 = 0.0663f;
constexpr float COEFF2 = 0.2447f;
} // namespace
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline float16x8_t inv(float16x8_t x)
-{
- const float16x8_t estimate = vrecpeq_f16(x);
- return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
-}
-
-inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
-{
- static const float16x8_t one = vdupq_n_f16(1.0f);
- static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
- static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
- static const float16x8_t piover4 = vdupq_n_f16(PI_4);
- static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
- static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
-
- const float16x8_t abs_gx = vabsq_f16(gx);
- const float16x8_t abs_gy = vabsq_f16(gy);
- const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
- const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
-
- // z = min(x, y) / max(x, y)
- const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
- const float16x8_t absz = vabsq_f16(z);
-
- // = x * [pi/4 + (1 - |x|) * (0.2447 + 0.0663 * |x|)]
- float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
- vsubq_f16(one, absz),
- vfmaq_f16(coeff2, coeff1, absz)));
-
- // Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]
- arctan = vmulq_f16(arctan, scale);
-
- /* If z > 1, result = 90 - result */
- return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
-}
-
-inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
-{
- static const float16x8_t scale = vdupq_n_f16(SCALE_360);
- static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
- static const float16x8_t zero = vdupq_n_f16(0.0f);
- static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
-
- float16x8_t arctan = atan2_fast(gx, gy, scale);
-
- // Choose correct quadrant
- arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
- arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
-
- return arctan;
-}
-
-inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
-{
- static const float16x8_t scale = vdupq_n_f16(SCALE_180);
- static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
- static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- float16x8_t arctan = atan2_fast(gx, gy, scale);
-
- // Choose correct quadrant
- arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
- arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
- arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
-
- return arctan;
-}
-
-inline float32x4_t invsqrtv(float32x4_t x)
-{
- float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
- sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
- sqrt_reciprocal);
-
- return sqrt_reciprocal;
-}
-
-inline float32x4_t sqrtv(float32x4_t x)
-{
- float32x4_t res = vdupq_n_f32(0.5f);
- return vmlaq_f32(res, x, invsqrtv(x));
-}
-
-inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
-{
- return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
-}
-
-inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
-{
- const int32x4x2_t square_x =
- {
- vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
- vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
- };
-
- const int32x4x2_t square_y =
- {
- vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
- vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
- };
-
- const uint32x4x2_t sum =
- {
- vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
- vreinterpretq_u32_s32(square_y.val[0])),
- vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
- vreinterpretq_u32_s32(square_y.val[1]))
- };
-
- const float32x4x2_t res =
- {
- sqrtv(vcvtq_f32_u32(sum.val[0])),
- sqrtv(vcvtq_f32_u32(sum.val[1]))
- };
-
- return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
- vqmovn_s32(vcvtq_s32_f32(res.val[1])));
-}
-
-inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
-{
- static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
-
- const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
- const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
-
- // Compute fast atan2
- const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
-
- return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
-}
-
-inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
-{
- static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
-
- const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
- const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
-
- // Compute fast atan2
- const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
-
- return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
-}
-
-template <MagnitudeType mag_type>
-inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx);
-
-template <>
-inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
-{
- const int16x8x2_t mag =
- {
- magnitude_l2(in0.val[0], gx.val[0]),
- magnitude_l2(in0.val[1], gx.val[1])
- };
-
- return mag;
-}
-
-template <>
-inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
-{
- const int16x8x2_t mag =
- {
- magnitude_l1(in0.val[0], gx.val[0]),
- magnitude_l1(in0.val[1], gx.val[1])
- };
-
- return mag;
-}
-
-template <PhaseType phase_type>
-inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx);
-
-template <>
-inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
-{
- return vcombine_u8(phase_signed(in0.val[0], gx.val[0]),
- phase_signed(in0.val[1], gx.val[1]));
-}
-
-template <>
-inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
-{
- return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]),
- phase_unsigned(in0.val[1], gx.val[1]));
-}
-} // namespace fp16
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
- : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
-{
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
- ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
-
- const bool run_mag = magnitude != nullptr;
- const bool run_phase = phase != nullptr;
-
- if(run_mag)
- {
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
- }
-
- if(run_phase)
- {
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
- }
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- if(run_mag && run_phase)
- {
- /* Run magnitude and phase */
- _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
- }
- else if(run_mag)
- {
- /* Run magnitude */
- _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
- }
- else if(run_phase)
- {
- /* Run phase */
- _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
- }
- else
- {
- ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
- }
-
- const unsigned int num_elems_processed_per_iteration = 16;
-
- // Configure kernel window
- Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
- magnitude_access,
- phase_access);
-
- ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
- gy->info()->valid_region());
-
- magnitude_access.set_valid_region(win, valid_region);
- phase_access.set_valid_region(win, valid_region);
-
- INEKernel::configure(win);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
-{
- Iterator gx(_gx, window);
- Iterator gy(_gy, window);
- Iterator magnitude(_magnitude, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t input1 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
- };
-
- const int16x8x2_t input2 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
- };
-
- // Compute and store magnitude
- const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
-
- /* Store magnitude */
- vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
- vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
- },
- gx, gy, magnitude);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
-{
- Iterator gx(_gx, window);
- Iterator gy(_gy, window);
- Iterator phase(_phase, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t input1 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
- };
-
- const int16x8x2_t input2 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
- };
-
- // Compute and store phase
- vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
- },
- gx, gy, phase);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
-{
- Iterator gx(_gx, window);
- Iterator gy(_gy, window);
- Iterator magnitude(_magnitude, window);
- Iterator phase(_phase, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t input1 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
- };
-
- const int16x8x2_t input2 =
- {
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
- vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
- };
-
- // Compute and store magnitude
- const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
-
- vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
- vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
-
- // Compute and store phase
- vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
- },
- gx, gy, magnitude, phase);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (this->*_func)(window);
-}
-
-template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
-template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
-template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
-template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
namespace
{
inline float32x4_t inv(float32x4_t x)