aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON
diff options
context:
space:
mode:
Diffstat (limited to 'arm_compute/core/NEON')
-rw-r--r--arm_compute/core/NEON/INEKernel.h3
-rw-r--r--arm_compute/core/NEON/INESimpleKernel.h3
-rw-r--r--arm_compute/core/NEON/NEColorConvertHelper.inl102
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.inl4
-rw-r--r--arm_compute/core/NEON/NEMath.inl15
-rw-r--r--arm_compute/core/NEON/kernels/NEAccumulateKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEBox3x3Kernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NECannyEdgeKernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NELKTrackerKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h37
-rw-r--r--arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h51
-rw-r--r--arm_compute/core/NEON/wrapper/traits.h4
21 files changed, 215 insertions, 28 deletions
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
index 529606a709..32d7ab6338 100644
--- a/arm_compute/core/NEON/INEKernel.h
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
namespace arm_compute
{
+/** Common interface for all kernels implemented in NEON. */
using INEKernel = ICPPKernel;
} // namespace arm_compute
#endif /*__ARM_COMPUTE_INEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
index 0d2211ac32..15fc3be5ed 100644
--- a/arm_compute/core/NEON/INESimpleKernel.h
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
namespace arm_compute
{
+/** Interface for simple NEON kernels having 1 tensor input and 1 tensor output */
using INESimpleKernel = ICPPSimpleKernel;
} // namespace arm_compute
#endif /*__ARM_COMPUTE_INESIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
index 9a9caefaab..0da5affe18 100644
--- a/arm_compute/core/NEON/NEColorConvertHelper.inl
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
namespace
{
+#ifndef DOXYGEN_SKIP_THIS
constexpr float red_coef_bt709 = 1.5748F;
constexpr float green_coef_bt709 = -0.1873f;
constexpr float green_coef2_bt709 = -0.4681f;
@@ -296,10 +297,18 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_u, uvec);
vst1q_u8(out_v, vvec);
}
+#endif /* DOXYGEN_SKIP_THIS */
}
namespace arm_compute
{
+/** Convert RGB to RGBX.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output RGBX buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -324,6 +333,13 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
in, out);
}
+/** Convert RGBX to RGB.
+ *
+ * @param[in] input Input RGBX data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -347,6 +363,13 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
in, out);
}
+/** Convert YUYV to RGB.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv, bool alpha>
void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -385,6 +408,13 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
in, out);
}
+/** Convert NV12 to RGB.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv, bool alpha>
void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -441,6 +471,13 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
in_y, in_uv, out);
}
+/** Convert IYUV to RGB.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -498,6 +535,13 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
in_y, in_u, in_v, out);
}
+/** Convert YUYV to NV12.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv>
void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -547,6 +591,13 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
in, out_y, out_uv);
}
+/** Convert IYUV to NV12.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -587,6 +638,13 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
in_y, in_u, in_v, out_y, out_uv);
}
+/** Convert NV12 to IYUV.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv>
void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -629,6 +687,13 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
in_y, in_uv, out_y, out_u, out_v);
}
+/** Convert YUYV to IYUV.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv>
void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -682,6 +747,13 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
in, out_y, out_u, out_v);
}
+/** Convert NV12 to YUV4.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv>
void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -734,6 +806,13 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
in_y, in_uv, out_y, out_u, out_v);
}
+/** Convert IYUV to YUV4.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -785,6 +864,13 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
in_y, in_u, in_v, out_y, out_u, out_v);
}
+/** Convert RGB to NV12.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -821,6 +907,13 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
in, out_y, out_uv);
}
+/** Convert RGB to IYUV.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -858,6 +951,13 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
in, out_y, out_u, out_v);
}
+/** Convert RGB to YUV4.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 966313d58b..b86c3cbec3 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -130,6 +130,7 @@ static const std::array<qint16x8_t, 4> log_tabq_qs16 =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
return vget_low_s8(a);
@@ -1996,4 +1997,5 @@ inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
};
return res;
}
+#endif /* DOXYGEN_SKIP_THIS */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 50f217c1f1..84154020a5 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
namespace arm_compute
{
-/* Exponent polynomial coefficients */
+/** Exponent polynomial coefficients */
const std::array<float32x4_t, 8> exp_tab =
{
{
@@ -39,7 +39,7 @@ const std::array<float32x4_t, 8> exp_tab =
}
};
-/* Logarithm polynomial coefficients */
+/** Logarithm polynomial coefficients */
const std::array<float32x4_t, 8> log_tab =
{
{
@@ -54,6 +54,7 @@ const std::array<float32x4_t, 8> log_tab =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline float32x4_t vfloorq_f32(float32x4_t val)
{
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -168,8 +169,10 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
{
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
+#endif /* DOXYGEN_SKIP_THIS */
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/* Exponent polynomial coefficients */
+/** Exponent polynomial coefficients */
const std::array<float16x8_t, 8> exp_tab_f16 =
{
{
@@ -184,7 +187,7 @@ const std::array<float16x8_t, 8> exp_tab_f16 =
}
};
-/* Logarithm polynomial coefficients */
+/** Logarithm polynomial coefficients */
const std::array<float16x8_t, 8> log_tab_f16 =
{
{
@@ -199,6 +202,7 @@ const std::array<float16x8_t, 8> log_tab_f16 =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline float16x4_t vinvsqrt_f16(float16x4_t x)
{
float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
@@ -301,5 +305,6 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
{
return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
}
+#endif /* DOXYGEN_SKIP_THIS */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
index ad5a16c9f3..82a4199761 100644
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -101,6 +101,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Interface for the accumulate weighted kernel using F16 */
using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 63eb739487..2408a665e4 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -47,7 +47,7 @@ public:
NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
/** Default destructor */
~NEBatchNormalizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
index 9c139551cb..2f93fd2480 100644
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -51,7 +51,7 @@ public:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform a Box 3x3 filter using F16 simd
+/** NEON kernel to perform a Box 3x3 filter for FP16 datatype
*/
class NEBox3x3FP16Kernel : public NEBox3x3Kernel
{
@@ -64,6 +64,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform a Box 3x3 filter for FP16 datatype */
using NEBox3x3FP16Kernel = NEBox3x3Kernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
index 401b9e47af..58ef1757fe 100644
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -86,7 +86,7 @@ protected:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Gradient computation
+/** NEON kernel to perform Gradient computation for FP16 datatype
*/
class NEGradientFP16Kernel : public NEGradientKernel
{
@@ -99,6 +99,7 @@ public:
void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform Gradient computation for FP16 datatype */
using NEGradientFP16Kernel = NEGradientKernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 5871cc5dcb..0c2f30a98c 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -49,7 +49,7 @@ public:
NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
/** Default Move Constructor. */
NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
/** Initialize the function's source, destination, conv and border_size.
*
diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
index 7ee2078e9e..f48e76f340 100644
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
@@ -50,7 +50,7 @@ public:
NEDequantizationLayerKernel &operator=(const NEDequantizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEDequantizationLayerKernel(NEDequantizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEDequantizationLayerKernel &operator=(NEDequantizationLayerKernel &&) = default;
/** Default destructor */
~NEDequantizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 286be1acc9..a05d591850 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -30,6 +30,7 @@ namespace arm_compute
{
class ITensor;
+/** Interface for the GEMM matrix vector multiply kernel. */
class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
{
public:
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index 8037e41695..aabf8b312b 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -127,6 +127,7 @@ private:
HarrisScoreFunction *_func;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Interface for the Harris score kernel using FP16 */
template <int32_t block_size>
using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
index f2105582eb..9a8947f9a0 100644
--- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -45,6 +45,7 @@ struct NELKInternalKeypoint
bool tracking_status{ false }; /**< the tracking status of the keypoint */
};
+/** Interface for NEON Array of Internal Key Points. */
using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
/** Interface for the Lucas-Kanade tracker kernel */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
index 522ed54f95..696721673d 100644
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -165,6 +165,7 @@ private:
ITensor *_phase; /**< Output - Phase */
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Template interface for the kernel to compute magnitude and phase */
template <MagnitudeType mag_type, PhaseType phase_type>
using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index f122ed15fd..588de49316 100644
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -83,7 +83,7 @@ protected:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
*/
class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
{
@@ -101,6 +101,7 @@ public:
void configure(const ITensor *input, ITensor *output, bool border_undefined);
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index b835ca7c53..6ae7b73423 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -47,7 +47,7 @@ public:
NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
/** Default Move Constructor. */
NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
/** Default destructor */
~NENormalizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
index e7cf0a8ca4..ca7658bb7e 100644
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -50,7 +50,7 @@ public:
NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
/** Default destructor */
~NEQuantizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
index 9169b75d19..2f44d19b4f 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
@@ -34,6 +34,7 @@ namespace arm_compute
{
class ITensor;
+/** Interface for the NEON kernel to perform Winograd input transform. */
template <typename T>
class INEWinogradLayerTransformInputKernel : public INEKernel
{
@@ -46,6 +47,8 @@ public:
* @param[in] n_rows Number of rows in each feature map.
* @param[in] n_cols Number of columns in each feature map.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TIn) required.
*/
virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0;
@@ -72,11 +75,13 @@ public:
*/
virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0;
+ /** Destructor */
virtual ~INEWinogradLayerTransformInputKernel()
{
}
};
+/** NEON kernel to perform Winograd input transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel<T>
{
@@ -89,6 +94,8 @@ public:
* @param[in] n_rows Number of rows in each feature map.
* @param[in] n_cols Number of columns in each feature map.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TIn) required.
*/
unsigned int get_input_storage_size(
int n_batches,
@@ -107,6 +114,7 @@ public:
*/
int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override;
+ /** Default constructor */
NEWinogradLayerTransformInputKernel();
const char *name() const override
@@ -139,7 +147,9 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
bool is_parallelisable() const override;
+ /** Winograd base kernel */
using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelCols, KernelCols>;
+ /** Winograd convolution kernel */
using WinogradConv = typename WinogradBase::template Convolution<T, T>;
private:
@@ -147,6 +157,7 @@ private:
std::unique_ptr<InputTransform> _transform;
};
+/** Interface for the NEON kernel to perform Winograd output transform. */
template <typename T>
class INEWinogradLayerTransformOutputKernel : public INEKernel
{
@@ -159,6 +170,8 @@ public:
* @param[in] n_cols Number of columns in each feature map of the input tensor.
* @param[in] n_output_channels Number of feature maps in the output tensor.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TOut) required.
*/
virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0;
@@ -208,6 +221,7 @@ public:
}
};
+/** NEON kernel to perform Winograd output transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel<T>
{
@@ -227,7 +241,7 @@ public:
NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
/** Allow instances of this class to be moved */
NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
-
+ /** Default destructor */
~NEWinogradLayerTransformOutputKernel() = default;
// Inherited methods overridden:
@@ -239,6 +253,8 @@ public:
* @param[in] n_cols Number of columns in each feature map of the input tensor.
* @param[in] n_output_channels Number of feature maps in the output tensor.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TOut) required.
*/
unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override;
@@ -301,6 +317,7 @@ private:
int _n_channels;
};
+/** Interface for the NEON kernel to perform Winograd weights transform. */
template <typename T>
class INEWinogradLayerTransformWeightsKernel : public INEKernel
{
@@ -310,6 +327,8 @@ public:
*
* @param[in] n_output_channels Number of output feature maps.
* @param[in] n_input_channels Number of input feature maps.
+ *
+ * @return Storage size (in units of T) required.
*/
virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0;
/** Gets the stride between matrices in the kernel workspace
@@ -335,10 +354,12 @@ public:
}
};
+/** NEON kernel to perform Winograd weights transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel<T>
{
public:
+ /** Default constructor. */
NEWinogradLayerTransformWeightsKernel();
const char *name() const override
{
@@ -359,6 +380,7 @@ private:
std::unique_ptr<WeightsTransform> _transform;
};
+/** Interface for the NEON kernel to perform Winograd. */
template <typename TIn, typename TOut>
class INEWinogradLayerBatchedGEMMKernel : public INEKernel
{
@@ -406,16 +428,17 @@ public:
virtual int get_number_blocks() const = 0;
};
+/** NEON kernel to perform Winograd. */
template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel<TIn, TOut>
{
public:
+ /** Winograd base kernel */
using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
+ /** Winograd convolution kernel */
using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;
- using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, TIn, TOut>;
-
- static const int _output_tile_rows = OutputTileRows;
- static const int _output_tile_cols = OutputTileCols;
+ /** Winograd batched blocked GEMM operator */
+ using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, TIn, TOut>;
const char *name() const override
{
@@ -432,7 +455,7 @@ public:
NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default;
/** Allow instances of this class to be moved */
NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default;
-
+ /** Default destructor. */
~NEWinogradLayerBatchedGEMMKernel() = default;
// Inherited methods overridden:
@@ -474,6 +497,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
+ static const int _output_tile_rows = OutputTileRows;
+ static const int _output_tile_cols = OutputTileCols;
std::unique_ptr<MultiGEMM> _gemms;
};
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index e4d3f54943..71d5a9eef7 100644
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -30,17 +30,25 @@ namespace arm_compute
{
namespace detail
{
-// Dummy activation object
/** Dummy activation object */
template <typename T, int S>
struct dummy
{
+ /** NEON vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** Construct a dummy activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit dummy(ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(act_info);
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
ARM_COMPUTE_UNUSED(vval);
@@ -50,62 +58,97 @@ struct dummy
template <typename T, int S>
struct relu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit relu(ActivationLayerInfo act_info)
: vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmax(vzero, vval);
}
+ /** Vector of zeroes. */
const ExactType vzero;
};
/** Bounded RELU activation object */
template <typename T, int S>
struct brelu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a bounded RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit brelu(ActivationLayerInfo act_info)
: vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
{
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
}
+ /** Vector of zeroes. */
const ExactType vzero;
+ /** Vector of alphas. */
const ExactType valpha;
};
/** Lower-Upper Bounded RELU activation object */
template <typename T, int S>
struct lubrelu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a lower-upper bounded RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit lubrelu(ActivationLayerInfo act_info)
: valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
{
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
}
+ /** Vector of alphas. */
const ExactType valpha;
+ /** Vector of betas. */
const ExactType vbeta;
};
} // namespace detail
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
index 08b2c9b48f..495ddbb1af 100644
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ b/arm_compute/core/NEON/wrapper/traits.h
@@ -42,7 +42,8 @@ struct vector_128_tag {};
/** Create the appropriate NEON vector given its type and size */
template <typename T, int S> struct neon_vector;
-/** Specializations */
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
template <> struct neon_vector<uint8_t, 8>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<int8_t, 8>{ using type = int8x8_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<uint8_t, 16>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
@@ -61,6 +62,7 @@ template <> struct neon_vector<uint64_t, 2>{ using type = uint64x2_t; using tag_
template <> struct neon_vector<int64_t, 2>{ using type = int64x2_t; using tag_type = vector_128_tag; };
template <> struct neon_vector<float_t, 2>{ using type = float32x2_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<float_t, 4>{ using type = float32x4_t; using tag_type = vector_128_tag; };
+#endif /* DOXYGEN_SKIP_THIS */
/** Helper type template to get the type of a neon vector */
template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;