| author | Pablo Tello <pablo.tello@arm.com> | 2017-07-03 16:25:09 +0100 |
|---|---|---|
| committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-09-17 14:15:39 +0100 |
| commit | df24618b53cffed1c574e11e9fd4ba7740f8c009 (patch) | |
| tree | 1f1145bca27c5dd0ca63538c2e8cdadd2b0a03cf /arm_compute | |
| parent | d1b0ecc206e3858327503888c4a46842ec1808e9 (diff) | |
| download | ComputeLibrary-df24618b53cffed1c574e11e9fd4ba7740f8c009.tar.gz | |
COMPMID-421: Added FP16 support to NENormalizationLayer and NEPixelWiseMultiplication.
Change-Id: If174f8071502fc5cc94b27cd44a9b1d5e451a9e2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79553
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'arm_compute')
| -rw-r--r-- | arm_compute/core/NEON/NEMath.h | 20 |
| -rw-r--r-- | arm_compute/core/NEON/NEMath.inl | 98 |
| -rw-r--r-- | arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h | 4 |
| -rw-r--r-- | arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h | 6 |
| -rw-r--r-- | arm_compute/runtime/NEON/functions/NENormalizationLayer.h | 2 |
5 files changed, 123 insertions, 7 deletions
```diff
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index bb8a330c1e..8dd9d609e7 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -91,6 +91,26 @@ float32x4_t vtanhq_f32(float32x4_t val);
  * @return The calculated power.
  */
 float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F16 format.
+ *
+ * @return The calculated exponent.
+ */
+float16x8_t vexpq_f16(float16x8_t x);
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F16 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
 }
 #include "arm_compute/core/NEON/NEMath.inl"
 #endif /* __ARM_COMPUTE_NEMATH_H__ */
```
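The header documents `vpowq_f16` via the identity pow(x, n) = e^(n·log(x)), which holds only for x > 0. As a minimal scalar sketch of that identity (illustration only, not part of the patch; it uses nothing beyond `<cmath>`):

```cpp
#include <cmath>
#include <cstdio>

// pow(x, n) computed the way the header describes: e^(n * log(x)).
// Valid for x > 0; the vector versions inherit the same restriction.
float pow_via_exp_log(float x, float n)
{
    return std::exp(n * std::log(x));
}

int main()
{
    std::printf("%f\n", pow_via_exp_log(2.f, 10.f)); // prints ~1024.000000
    return 0;
}
```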
```diff
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 9a49493cf6..c73c54501f 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -141,4 +141,100 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 {
     return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
 }
-}
\ No newline at end of file
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/* Exponent polynomial coefficients */
+const std::array<float16x8_t, 8> exp_tab_f16 =
+{
+    {
+        vdupq_n_f16(1.f),
+        vdupq_n_f16(0.0416598916054f),
+        vdupq_n_f16(0.500000596046f),
+        vdupq_n_f16(0.0014122662833f),
+        vdupq_n_f16(1.00000011921f),
+        vdupq_n_f16(0.00833693705499f),
+        vdupq_n_f16(0.166665703058f),
+        vdupq_n_f16(0.000195780929062f),
+    }
+};
+
+/* Logarithm polynomial coefficients */
+const std::array<float16x8_t, 8> log_tab_f16 =
+{
+    {
+        vdupq_n_f16(-2.29561495781f),
+        vdupq_n_f16(-2.47071170807f),
+        vdupq_n_f16(-5.68692588806f),
+        vdupq_n_f16(-0.165253549814f),
+        vdupq_n_f16(5.17591238022f),
+        vdupq_n_f16(0.844007015228f),
+        vdupq_n_f16(4.58445882797f),
+        vdupq_n_f16(0.0141278216615f),
+    }
+};
+
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t recip = vrecpeq_f16(x);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
+    return recip;
+}
+
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
+{
+    const float16x8_t A   = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
+    const float16x8_t B   = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
+    const float16x8_t C   = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
+    const float16x8_t D   = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    static const float16x8_t CONST_LN2          = vdupq_n_f16(0.6931471805f); // ln(2)
+    static const float16x8_t CONST_INV_LN2      = vdupq_n_f16(1.4426950408f); // 1/ln(2)
+    static const float16x8_t CONST_0            = vdupq_n_f16(0.f);
+    static const int16x8_t   CONST_NEGATIVE_126 = vdupq_n_s16(-126);
+
+    // Perform range reduction [-log(2),log(2)]
+    const int16x8_t   m   = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2));
+    const float16x8_t val = vsubq_f16(x, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2));
+
+    // Polynomial Approximation
+    float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16);
+
+    // Reconstruct
+    poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 9)));
+    poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_126), CONST_0, poly);
+
+    return poly;
+}
+
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    static const int16x8_t   CONST_127 = vdupq_n_s16(127);           // 127
+    static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    const int16x8_t   m   = vsubq_s16(vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_f16(x), 9)), CONST_127);
+    const float16x8_t val = vreinterpretq_f16_s16(vsubq_s16(vreinterpretq_s16_f16(x), vshlq_n_s16(m, 9)));
+
+    // Polynomial Approximation
+    float16x8_t poly = vtaylor_polyq_f16(val, log_tab_f16);
+
+    // Reconstruct
+    poly = vaddq_f16(poly, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2));
+
+    return poly;
+}
+
+inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
+{
+    return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
```
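`vexpq_f16` follows the classic range-reduction scheme: split x as x = m·ln(2) + val, approximate e^val with a fixed polynomial (evaluated by `vtaylor_polyq_f16` in an Estrin-style order for instruction-level parallelism), then reconstruct e^x = 2^m·e^val by adding m into the reinterpreted exponent bits (the `vqshlq_n_s16(m, 9)` step), flushing to zero on underflow. A hedged scalar mirror of the same scheme, using `std::ldexp` for the 2^m reconstruction instead of bit manipulation (illustration only, not the patch's code):

```cpp
#include <cmath>
#include <cstdio>

// Scalar mirror of the range-reduction scheme used by vexpq_f16 above.
float exp_range_reduced(float x)
{
    const float ln2     = 0.6931471805f; // ln(2)
    const float inv_ln2 = 1.4426950408f; // 1/ln(2)

    const int   m   = static_cast<int>(x * inv_ln2); // x ~= m*ln(2) + val
    const float val = x - m * ln2;                   // residual, |val| < ~ln(2)

    // Low-degree Taylor polynomial for e^val around 0, playing the role
    // of the fixed coefficient table in the NEON version.
    float poly = 1.f, term = 1.f;
    for (int i = 1; i <= 7; ++i)
    {
        term *= val / static_cast<float>(i);
        poly += term;
    }

    return std::ldexp(poly, m); // reconstruct: e^x = 2^m * e^val
}

int main()
{
    std::printf("%f vs %f\n", exp_range_reduced(3.f), std::exp(3.f));
    return 0;
}
```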
```diff
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index d4e36d5ff1..b1bc594e4c 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -73,8 +73,8 @@ private:
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    template <unsigned int dim, bool do_2D_norm>
-    void normalize(const Window &window);
+    template <DataType dt, unsigned int dim, bool do_2D_norm>
+    void normalize_float(const Window &window);
 
     /** Function to perform normalization for fixed-point values depending on
      * the given template dimension. The second template parameter specifies
```

```diff
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 7e402cd220..433a20e48e 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -52,9 +52,9 @@ public:
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F32.
-     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8) /S16/F32.
+     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F16/F32.
+     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F16/F32.
+     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8) /S16/F16/F32.
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy.
```

```diff
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index 3202867c43..4cfea226f3 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -52,7 +52,7 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                       and an optional 4th dimension for batch of inputs. Data type supported: QS8/F32
+     *                       and an optional 4th dimension for batch of inputs. Data type supported: QS8/F16/F32
      * @param[out] output    Destination with the same dimensions, data type and number of channels of @p input
      * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
```
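With these changes the normalization layer accepts F16 tensors when the library is built with `ARM_COMPUTE_ENABLE_FP16`. A hedged usage sketch of the standard configure/allocate/run flow (the 32×32×16 shape and norm parameters are arbitrary illustration values, not from the patch):

```cpp
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Declare an F16 input/output pair; [width, height, IFM] layout.
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F16));
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F16));

    // Configure cross-map normalization over 5 neighbouring maps.
    NENormalizationLayer norm;
    norm.configure(&input, &output, NormalizationLayerInfo(NormType::CROSS_MAP, 5));

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input with F16 data ...
    norm.run();
    return 0;
}
```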