Diffstat (limited to 'arm_compute/runtime/NEON')
-rw-r--r--  arm_compute/runtime/NEON/functions/NEActivationLayer.h | 12
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArithmeticAddition.h | 30
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConcatenateLayer.h | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h | 9
-rw-r--r--  arm_compute/runtime/NEON/functions/NECopy.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDequantizationLayer.h | 17
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h | 10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFFT1D.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFFT2D.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFill.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFloor.h | 9
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPReluLayer.h | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPermute.h | 9
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h | 18
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPoolingLayer.h | 12
-rw-r--r--  arm_compute/runtime/NEON/functions/NEQuantizationLayer.h | 19
-rw-r--r--  arm_compute/runtime/NEON/functions/NEReshapeLayer.h | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEScale.h | 14
-rw-r--r--  arm_compute/runtime/NEON/functions/NESlice.h | 10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEStridedSlice.h | 10
-rw-r--r--  arm_compute/runtime/NEON/functions/NETranspose.h | 8
22 files changed, 243 insertions, 14 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index ffda8406aa..b39a8d7701 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -62,6 +62,18 @@ public:
/** [NEActivationLayer snippet] **/
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
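As a quick orientation for the in-place behaviour documented above, a minimal usage sketch follows; the tensor shape, the F32 data type and the choice of RELU are illustrative assumptions, not part of this patch:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Any src/dst pair from the table above is valid; F32 is used here as an example.
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        src.allocator()->allocate();

        // Passing nullptr as the destination performs the activation in-place, as the note describes.
        NEActivationLayer act;
        act.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        act.run();
        return 0;
    }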
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 8f9fd27906..734e3502dd 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -51,19 +51,25 @@ public:
NEArithmeticAddition &operator=(NEArithmeticAddition &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * Valid configurations (Input1,Input2) -> Output :
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
*
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |QSYMM16 |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
*
* @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
* @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
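The widening U8 + U8 -> S16 row is the least obvious entry in the table above, so a short sketch may help; the shape and the SATURATE policy are illustrative assumptions:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        const TensorShape shape(32U, 32U);
        Tensor a, b, sum;
        // U8 + U8 -> S16: the wider destination cannot overflow, matching one row of the table.
        a.allocator()->init(TensorInfo(shape, 1, DataType::U8));
        b.allocator()->init(TensorInfo(shape, 1, DataType::U8));
        sum.allocator()->init(TensorInfo(shape, 1, DataType::S16));
        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();

        NEArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);
        add.run();
        return 0;
    }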
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index 6aa724ab0c..dd1c709d76 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -55,6 +55,17 @@ public:
NEConcatenateLayer &operator=(NEConcatenateLayer &&);
/** Initialise the kernel's inputs vector and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note Input and output tensor dimension preconditions differ depending on the concatenation axis.
* @note Preconditions can be found respectively at @ref cpu::kernels::CpuConcatenateWidthKernel, @ref cpu::kernels::CpuConcatenateHeightKernel,
* @ref cpu::kernels::CpuConcatenateDepthKernel and @ref cpu::kernels::CpuConcatenateBatchKernel.
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index f9ce66db13..218877d421 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -51,6 +51,15 @@ public:
~NEConvertFullyConnectedWeights();
/** Initialize the function.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index 56f8bd9803..ee02c259f4 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -53,6 +53,14 @@ public:
NECopy &operator=(NECopy &&);
/** Initialise the function's source and destination.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data types supported: All
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
index a18566e6ca..dfec835f45 100644
--- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
@@ -53,6 +53,23 @@ public:
NEDequantizationLayer &operator=(NEDequantizationLayer &&) = default;
/** Configure the kernel.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:--------------|
+ * |QASYMM8 |F16 |
+ * |QASYMM8 |F32 |
+ * |QASYMM8_SIGNED |F16 |
+ * |QASYMM8_SIGNED |F32 |
+ * |QSYMM8_PER_CHANNEL |F16 |
+ * |QSYMM8_PER_CHANNEL |F32 |
+ * |QSYMM8 |F16 |
+ * |QSYMM8 |F32 |
+ * |QSYMM16 |F16 |
+ * |QSYMM16 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
* @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
*/
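A minimal sketch of the QASYMM8 -> F32 row from the table above; the shape and the scale/offset of the QuantizationInfo are illustrative assumptions:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        const TensorShape shape(8U, 8U);
        Tensor q, f;
        // The quantized source carries its own scale/offset; the destination is plain F32.
        q.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        f.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        q.allocator()->allocate();
        f.allocator()->allocate();

        NEDequantizationLayer dequant;
        dequant.configure(&q, &f);
        dequant.run();
        return 0;
    }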
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index fc4017e635..82cabed6c9 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -58,6 +58,16 @@ public:
~NEDirectConvolutionLayer();
/** Set the input, weights, biases and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------|:------|:------|:------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
* @note: DirectConvolution only works in the following configurations:
* 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
* 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
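To make the 3x3, stride-1 configuration from the note concrete, here is a sketch; the NCHW shapes (32x32x3 input, 8 output feature maps) and zero padding are illustrative assumptions:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, weights, bias, dst;
        // NCHW shapes: input [W,H,C], weights [Kw,Kh,C,OFM], bias [OFM], output [W',H',OFM].
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 8U), 1, DataType::F32));
        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        NEDirectConvolutionLayer conv;
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0)); // stride 1, no padding
        conv.run();
        return 0;
    }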
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index a533aa7f48..9654b1e604 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -63,6 +63,14 @@ public:
~NEFFT1D();
/** Initialise the function's source and destinations.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* Number of channels supported: 1 (real tensor) or 2 (complex tensor). If @p input is real, @p output must be complex.
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index ce84a85105..57f38d1942 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -58,6 +58,14 @@ public:
~NEFFT2D();
/** Initialise the function's source and destinations
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index 213fa6093b..c5f4d45b6b 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -73,6 +73,14 @@ public:
~NEFFTConvolutionLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
* @note: This function only works with square kernels (of any size) and unit strides, for both NCHW and NHWC data layouts
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index ba5d020496..e923ce33e1 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -53,6 +53,14 @@ public:
NEFill &operator=(NEFill &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @param[in,out] tensor Source tensor. Data types supported: All
* @param[in] constant_value Constant value to use to fill tensor.
*/
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index 9560eb9169..4d47b068db 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -54,6 +54,15 @@ public:
~NEFloor();
/** Set the source, destination of the kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input Source tensor. Data type supported: F16/F32.
* @param[out] output Destination tensor. Same as @p input
*/
diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
index b07febfe7f..81d5fd162c 100644
--- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
@@ -55,6 +55,17 @@ public:
NEPReluLayer &operator=(NEPReluLayer &&);
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] alpha Source alpha tensor. Data types supported: same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index 2508458a3d..c863fde0ac 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -54,6 +54,15 @@ public:
NEPermute &operator=(NEPermute &&) = default;
/** Configure the permute function
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @note Arbitrary permutation vectors are supported with rank not greater than 4
*
* @param[in] input The input tensor to permute. Data types supported: All
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 6f4cce3cde..f8074e791a 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -52,6 +52,24 @@ public:
NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&) = default;
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |QSYMM16 |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
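A sketch of the S16 x S16 -> S16 row using a scale of 1 and TO_ZERO rounding, which is the general case described in the note above; the shape and the policies are illustrative assumptions:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        const TensorShape shape(32U, 32U);
        Tensor a, b, prod;
        a.allocator()->init(TensorInfo(shape, 1, DataType::S16));
        b.allocator()->init(TensorInfo(shape, 1, DataType::S16));
        prod.allocator()->init(TensorInfo(shape, 1, DataType::S16));
        a.allocator()->allocate();
        b.allocator()->allocate();
        prod.allocator()->allocate();

        NEPixelWiseMultiplication mul;
        // For scale values other than 1/255 only round-to-zero is supported, per the note above.
        mul.configure(&a, &b, &prod, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
        mul.run();
        return 0;
    }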
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index cb136ebca9..851dc0ca32 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -59,6 +59,18 @@ public:
~NEPoolingLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note F16 is supported for pool sizes 2 and 3 only
*
* @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 9e2d9ecf24..a7fadfc7cd 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -52,6 +52,25 @@ public:
NEQuantizationLayer &operator=(NEQuantizationLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8 |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM16 |
+ * |QASYMM8_SIGNED |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM16 |
+ * |F16 |QASYMM8 |
+ * |F16 |QASYMM8_SIGNED |
+ * |F16 |QASYMM16 |
+ * |F32 |QASYMM8 |
+ * |F32 |QASYMM8_SIGNED |
+ * |F32 |QASYMM16 |
+ *
* @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
* @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
*/
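A minimal sketch of the F32 -> QASYMM8 row from the table above; the shape and the destination scale/offset are illustrative assumptions:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        const TensorShape shape(8U, 8U);
        Tensor src, dst;
        src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        // The destination's QuantizationInfo defines the target scale and offset.
        dst.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8, QuantizationInfo(1.0f / 255.0f, 0)));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        NEQuantizationLayer quant;
        quant.configure(&src, &dst);
        quant.run();
        return 0;
    }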
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index b8c0a841bc..3e6e33f797 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -52,6 +52,14 @@ public:
NEReshapeLayer &operator=(NEReshapeLayer &&);
/** Initialise the kernel's inputs and outputs
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data type supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 45658a7cd3..233ee2969e 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -48,6 +48,20 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(NEScale);
/** Initialize the function's source, destination, interpolation type and border_mode.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |U8 |U8 |
+ * |S16 |S16 |
+ *
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to be used for configuration
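A sketch of a bilinear upscale that only changes the XY-plane, as the parameter description above requires; the 16x16 -> 32x32 F32 shapes, the BILINEAR/REPLICATE choices and the KernelDescriptors.h include path are assumptions of this example:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEScale.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        // Only the two lowest (XY) dimensions change; the channel dimension stays at 3.
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        NEScale scale;
        scale.configure(&src, &dst, ScaleKernelInfo{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE });
        scale.run();
        return 0;
    }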
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index 28628778cb..214ffa512c 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,6 +89,14 @@ public:
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
* @note Start indices must be non-negative. 0 <= starts[i]
* @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
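A sketch extracting a 4x4 region with non-negative start indices and exclusive end coordinates, as the notes above describe; the shapes and coordinates are illustrative assumptions:

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NESlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        // Slice starts = (1, 1), ends = (5, 5) out of an 8x8 tensor, giving a 4x4 output.
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        NESlice slice;
        slice.configure(&src, &dst, Coordinates(1, 1), Coordinates(5, 5));
        slice.run();
        return 0;
    }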
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index f9c94f5301..7ba6a52a58 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,6 +95,14 @@ public:
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor. Data type supported: All
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 78916f67b7..581fe74309 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -54,6 +54,14 @@ public:
NETranspose &operator=(NETranspose &&) = default;
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
*/