aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON
diff options
context:
space:
mode:
Diffstat (limited to 'arm_compute/core/NEON')
-rw-r--r--arm_compute/core/NEON/INEKernel.h3
-rw-r--r--arm_compute/core/NEON/INESimpleKernel.h3
-rw-r--r--arm_compute/core/NEON/NEColorConvertHelper.inl102
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.inl4
-rw-r--r--arm_compute/core/NEON/NEMath.inl15
-rw-r--r--arm_compute/core/NEON/kernels/NEAccumulateKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEBox3x3Kernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NECannyEdgeKernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NELKTrackerKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h3
-rw-r--r--arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h37
-rw-r--r--arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h51
-rw-r--r--arm_compute/core/NEON/wrapper/traits.h4
21 files changed, 215 insertions, 28 deletions
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
index 529606a709..32d7ab6338 100644
--- a/arm_compute/core/NEON/INEKernel.h
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
namespace arm_compute
{
+/** Common interface for all kernels implemented in NEON. */
using INEKernel = ICPPKernel;
} // namespace arm_compute
#endif /*__ARM_COMPUTE_INEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
index 0d2211ac32..15fc3be5ed 100644
--- a/arm_compute/core/NEON/INESimpleKernel.h
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
namespace arm_compute
{
+/** Interface for simple NEON kernels having 1 tensor input and 1 tensor output */
using INESimpleKernel = ICPPSimpleKernel;
} // namespace arm_compute
#endif /*__ARM_COMPUTE_INESIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
index 9a9caefaab..0da5affe18 100644
--- a/arm_compute/core/NEON/NEColorConvertHelper.inl
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
namespace
{
+#ifndef DOXYGEN_SKIP_THIS
constexpr float red_coef_bt709 = 1.5748F;
constexpr float green_coef_bt709 = -0.1873f;
constexpr float green_coef2_bt709 = -0.4681f;
@@ -296,10 +297,18 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_u, uvec);
vst1q_u8(out_v, vvec);
}
+#endif /* DOXYGEN_SKIP_THIS */
}
namespace arm_compute
{
+/** Convert RGB to RGBX.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output RGBX buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -324,6 +333,13 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
in, out);
}
+/** Convert RGBX to RGB.
+ *
+ * @param[in] input Input RGBX data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -347,6 +363,13 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
in, out);
}
+/** Convert YUYV to RGB.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv, bool alpha>
void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -385,6 +408,13 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
in, out);
}
+/** Convert NV12 to RGB.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv, bool alpha>
void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -441,6 +471,13 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
in_y, in_uv, out);
}
+/** Convert IYUV to RGB.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output RGB buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -498,6 +535,13 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
in_y, in_u, in_v, out);
}
+/** Convert YUYV to NV12.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv>
void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -547,6 +591,13 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
in, out_y, out_uv);
}
+/** Convert IYUV to NV12.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -587,6 +638,13 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
in_y, in_u, in_v, out_y, out_uv);
}
+/** Convert NV12 to IYUV.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv>
void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -629,6 +687,13 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
in_y, in_uv, out_y, out_u, out_v);
}
+/** Convert YUYV to IYUV.
+ *
+ * @param[in] input Input YUYV data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool yuyv>
void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -682,6 +747,13 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
in, out_y, out_u, out_v);
}
+/** Convert NV12 to YUV4.
+ *
+ * @param[in] input Input NV12 data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool uv>
void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -734,6 +806,13 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
in_y, in_uv, out_y, out_u, out_v);
}
+/** Convert IYUV to YUV4.
+ *
+ * @param[in] input Input IYUV data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
@@ -785,6 +864,13 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
in_y, in_u, in_v, out_y, out_u, out_v);
}
+/** Convert RGB to NV12.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output NV12 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -821,6 +907,13 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
in, out_y, out_uv);
}
+/** Convert RGB to IYUV.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output IYUV buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
{
@@ -858,6 +951,13 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
in, out_y, out_u, out_v);
}
+/** Convert RGB to YUV4.
+ *
+ * @param[in] input Input RGB data buffer.
+ * @param[out] output Output YUV4 buffer.
+ * @param[in] win Window for iterating the buffers.
+ *
+ */
template <bool alpha>
void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
{
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 966313d58b..b86c3cbec3 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -130,6 +130,7 @@ static const std::array<qint16x8_t, 4> log_tabq_qs16 =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
return vget_low_s8(a);
@@ -1996,4 +1997,5 @@ inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
};
return res;
}
+#endif /* DOXYGEN_SKIP_THIS */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 50f217c1f1..84154020a5 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
namespace arm_compute
{
-/* Exponent polynomial coefficients */
+/** Exponent polynomial coefficients */
const std::array<float32x4_t, 8> exp_tab =
{
{
@@ -39,7 +39,7 @@ const std::array<float32x4_t, 8> exp_tab =
}
};
-/* Logarithm polynomial coefficients */
+/** Logarithm polynomial coefficients */
const std::array<float32x4_t, 8> log_tab =
{
{
@@ -54,6 +54,7 @@ const std::array<float32x4_t, 8> log_tab =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline float32x4_t vfloorq_f32(float32x4_t val)
{
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -168,8 +169,10 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
{
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
+#endif /* DOXYGEN_SKIP_THIS */
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/* Exponent polynomial coefficients */
+/** Exponent polynomial coefficients */
const std::array<float16x8_t, 8> exp_tab_f16 =
{
{
@@ -184,7 +187,7 @@ const std::array<float16x8_t, 8> exp_tab_f16 =
}
};
-/* Logarithm polynomial coefficients */
+/** Logarithm polynomial coefficients */
const std::array<float16x8_t, 8> log_tab_f16 =
{
{
@@ -199,6 +202,7 @@ const std::array<float16x8_t, 8> log_tab_f16 =
}
};
+#ifndef DOXYGEN_SKIP_THIS
inline float16x4_t vinvsqrt_f16(float16x4_t x)
{
float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
@@ -301,5 +305,6 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
{
return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
}
+#endif /* DOXYGEN_SKIP_THIS */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
index ad5a16c9f3..82a4199761 100644
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -101,6 +101,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Interface for the accumulate weighted kernel using F16 */
using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 63eb739487..2408a665e4 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -47,7 +47,7 @@ public:
NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
/** Default destructor */
~NEBatchNormalizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
index 9c139551cb..2f93fd2480 100644
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -51,7 +51,7 @@ public:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform a Box 3x3 filter using F16 simd
+/** NEON kernel to perform a Box 3x3 filter for FP16 datatype
*/
class NEBox3x3FP16Kernel : public NEBox3x3Kernel
{
@@ -64,6 +64,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform a Box 3x3 filter for FP16 datatype */
using NEBox3x3FP16Kernel = NEBox3x3Kernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
index 401b9e47af..58ef1757fe 100644
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -86,7 +86,7 @@ protected:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Gradient computation
+/** NEON kernel to perform Gradient computation for FP16 datatype
*/
class NEGradientFP16Kernel : public NEGradientKernel
{
@@ -99,6 +99,7 @@ public:
void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform Gradient computation for FP16 datatype */
using NEGradientFP16Kernel = NEGradientKernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 5871cc5dcb..0c2f30a98c 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -49,7 +49,7 @@ public:
NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
/** Default Move Constructor. */
NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
/** Initialize the function's source, destination, conv and border_size.
*
diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
index 7ee2078e9e..f48e76f340 100644
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
@@ -50,7 +50,7 @@ public:
NEDequantizationLayerKernel &operator=(const NEDequantizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEDequantizationLayerKernel(NEDequantizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEDequantizationLayerKernel &operator=(NEDequantizationLayerKernel &&) = default;
/** Default destructor */
~NEDequantizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 286be1acc9..a05d591850 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -30,6 +30,7 @@ namespace arm_compute
{
class ITensor;
+/** Interface for the GEMM matrix vector multiply kernel. */
class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
{
public:
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index 8037e41695..aabf8b312b 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -127,6 +127,7 @@ private:
HarrisScoreFunction *_func;
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Interface for the Harris score kernel using FP16 */
template <int32_t block_size>
using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
index f2105582eb..9a8947f9a0 100644
--- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -45,6 +45,7 @@ struct NELKInternalKeypoint
bool tracking_status{ false }; /**< the tracking status of the keypoint */
};
+/** Interface for NEON Array of Internal Key Points. */
using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
/** Interface for the Lucas-Kanade tracker kernel */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
index 522ed54f95..696721673d 100644
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -165,6 +165,7 @@ private:
ITensor *_phase; /**< Output - Phase */
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** Template interface for the kernel to compute magnitude and phase */
template <MagnitudeType mag_type, PhaseType phase_type>
using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index f122ed15fd..588de49316 100644
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -83,7 +83,7 @@ protected:
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
*/
class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
{
@@ -101,6 +101,7 @@ public:
void configure(const ITensor *input, ITensor *output, bool border_undefined);
};
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index b835ca7c53..6ae7b73423 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -47,7 +47,7 @@ public:
NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
/** Default Move Constructor. */
NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
/** Default destructor */
~NENormalizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
index e7cf0a8ca4..ca7658bb7e 100644
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -50,7 +50,7 @@ public:
NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
/** Default Move Constructor. */
NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
- /** Default move assignment operator. */
+ /** Default move assignment operator */
NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
/** Default destructor */
~NEQuantizationLayerKernel() = default;
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
index 9169b75d19..2f44d19b4f 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
@@ -34,6 +34,7 @@ namespace arm_compute
{
class ITensor;
+/** Interface for the NEON kernel to perform Winograd input transform. */
template <typename T>
class INEWinogradLayerTransformInputKernel : public INEKernel
{
@@ -46,6 +47,8 @@ public:
* @param[in] n_rows Number of rows in each feature map.
* @param[in] n_cols Number of columns in each feature map.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TIn) required.
*/
virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0;
@@ -72,11 +75,13 @@ public:
*/
virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0;
+ /** Destructor */
virtual ~INEWinogradLayerTransformInputKernel()
{
}
};
+/** NEON kernel to perform Winograd input transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel<T>
{
@@ -89,6 +94,8 @@ public:
* @param[in] n_rows Number of rows in each feature map.
* @param[in] n_cols Number of columns in each feature map.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TIn) required.
*/
unsigned int get_input_storage_size(
int n_batches,
@@ -107,6 +114,7 @@ public:
*/
int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override;
+ /** Default constructor */
NEWinogradLayerTransformInputKernel();
const char *name() const override
@@ -139,7 +147,9 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
bool is_parallelisable() const override;
+ /** Winograd base kernel */
using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelCols, KernelCols>;
+ /** Winograd convolution kernel */
using WinogradConv = typename WinogradBase::template Convolution<T, T>;
private:
@@ -147,6 +157,7 @@ private:
std::unique_ptr<InputTransform> _transform;
};
+/** Interface for the NEON kernel to perform Winograd output transform. */
template <typename T>
class INEWinogradLayerTransformOutputKernel : public INEKernel
{
@@ -159,6 +170,8 @@ public:
* @param[in] n_cols Number of columns in each feature map of the input tensor.
* @param[in] n_output_channels Number of feature maps in the output tensor.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TOut) required.
*/
virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0;
@@ -208,6 +221,7 @@ public:
}
};
+/** NEON kernel to perform Winograd output transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel<T>
{
@@ -227,7 +241,7 @@ public:
NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
/** Allow instances of this class to be moved */
NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
-
+ /** Default destructor */
~NEWinogradLayerTransformOutputKernel() = default;
// Inherited methods overridden:
@@ -239,6 +253,8 @@ public:
* @param[in] n_cols Number of columns in each feature map of the input tensor.
* @param[in] n_output_channels Number of feature maps in the output tensor.
* @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
+ *
+ * @return Storage size (in units of TOut) required.
*/
unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override;
@@ -301,6 +317,7 @@ private:
int _n_channels;
};
+/** Interface for the NEON kernel to perform Winograd weights transform. */
template <typename T>
class INEWinogradLayerTransformWeightsKernel : public INEKernel
{
@@ -310,6 +327,8 @@ public:
*
* @param[in] n_output_channels Number of output feature maps.
* @param[in] n_input_channels Number of input feature maps.
+ *
+ * @return Storage size (in units of T) required.
*/
virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0;
/** Gets the stride between matrices in the kernel workspace
@@ -335,10 +354,12 @@ public:
}
};
+/** NEON kernel to perform Winograd weights transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel<T>
{
public:
+ /** Default constructor. */
NEWinogradLayerTransformWeightsKernel();
const char *name() const override
{
@@ -359,6 +380,7 @@ private:
std::unique_ptr<WeightsTransform> _transform;
};
+/** Interface for the NEON kernel to perform Winograd. */
template <typename TIn, typename TOut>
class INEWinogradLayerBatchedGEMMKernel : public INEKernel
{
@@ -406,16 +428,17 @@ public:
virtual int get_number_blocks() const = 0;
};
+/** NEON kernel to perform Winograd. */
template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel<TIn, TOut>
{
public:
+ /** Winograd base kernel */
using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
+ /** Winograd convolution kernel */
using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;
- using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, TIn, TOut>;
-
- static const int _output_tile_rows = OutputTileRows;
- static const int _output_tile_cols = OutputTileCols;
+ /** Winograd batched blocked GEMM operator */
+ using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, TIn, TOut>;
const char *name() const override
{
@@ -432,7 +455,7 @@ public:
NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default;
/** Allow instances of this class to be moved */
NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default;
-
+ /** Default destructor. */
~NEWinogradLayerBatchedGEMMKernel() = default;
// Inherited methods overridden:
@@ -474,6 +497,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
+ static const int _output_tile_rows = OutputTileRows;
+ static const int _output_tile_cols = OutputTileCols;
std::unique_ptr<MultiGEMM> _gemms;
};
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index e4d3f54943..71d5a9eef7 100644
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -30,17 +30,25 @@ namespace arm_compute
{
namespace detail
{
-// Dummy activation object
/** Dummy activation object */
template <typename T, int S>
struct dummy
{
+ /** NEON vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** Construct a dummy activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit dummy(ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(act_info);
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
ARM_COMPUTE_UNUSED(vval);
@@ -50,62 +58,97 @@ struct dummy
template <typename T, int S>
struct relu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit relu(ActivationLayerInfo act_info)
: vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmax(vzero, vval);
}
+ /** Vector of zeroes. */
const ExactType vzero;
};
/** Bounded RELU activation object */
template <typename T, int S>
struct brelu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a bounded RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit brelu(ActivationLayerInfo act_info)
: vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
{
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
}
+ /** Vector of zeroes. */
const ExactType vzero;
+ /** Vector of alphas. */
const ExactType valpha;
};
/** Lower-Upper Bounded RELU activation object */
template <typename T, int S>
struct lubrelu
{
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector type. */
+ using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
+ /** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ /** Construct a lower-upper bounded RELU activation object.
+ *
+ * @param[in] act_info Activation layer information.
+ */
explicit lubrelu(ActivationLayerInfo act_info)
: valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
{
}
+ /** Run activation function.
+ *
+ * @param[in] vval Vector of values.
+ */
void operator()(ExactType &vval)
{
vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
}
+ /** Vector of alphas. */
const ExactType valpha;
+ /** Vector of betas. */
const ExactType vbeta;
};
} // namespace detail
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
index 08b2c9b48f..495ddbb1af 100644
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ b/arm_compute/core/NEON/wrapper/traits.h
@@ -42,7 +42,8 @@ struct vector_128_tag {};
/** Create the appropriate NEON vector given its type and size */
template <typename T, int S> struct neon_vector;
-/** Specializations */
+// Specializations
+#ifndef DOXYGEN_SKIP_THIS
template <> struct neon_vector<uint8_t, 8>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<int8_t, 8>{ using type = int8x8_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<uint8_t, 16>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
@@ -61,6 +62,7 @@ template <> struct neon_vector<uint64_t, 2>{ using type = uint64x2_t; using tag_
template <> struct neon_vector<int64_t, 2>{ using type = int64x2_t; using tag_type = vector_128_tag; };
template <> struct neon_vector<float_t, 2>{ using type = float32x2_t; using tag_type = vector_64_tag; };
template <> struct neon_vector<float_t, 4>{ using type = float32x4_t; using tag_type = vector_128_tag; };
+#endif /* DOXYGEN_SKIP_THIS */
/** Helper type template to get the type of a neon vector */
template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;