From 62687420901c12be609426f3cf9dee300d25746a Mon Sep 17 00:00:00 2001 From: Teresa Charlin Date: Wed, 28 Apr 2021 10:58:49 +0100 Subject: Update operator list documentation. Part 2. All data type and data layout information for the operators are stored in the function header files Signed-off-by: Teresa Charlin Change-Id: I30b564f7eda6bbd99bf3ad36ddb6639ac118eb8b Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/319829 Tested-by: bsgcomp Reviewed-by: Michele DiGiorgio Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5531 Tested-by: Arm Jenkins Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins --- .../runtime/CL/functions/CLArgMinMaxLayer.h | 14 +- .../CL/functions/CLBatchNormalizationLayer.h | 12 +- .../runtime/CL/functions/CLBatchToSpaceLayer.h | 11 +- arm_compute/runtime/CL/functions/CLBitwiseAnd.h | 8 + arm_compute/runtime/CL/functions/CLBitwiseNot.h | 7 + arm_compute/runtime/CL/functions/CLBitwiseOr.h | 8 + arm_compute/runtime/CL/functions/CLBitwiseXor.h | 8 + .../runtime/CL/functions/CLBoundingBoxTransform.h | 13 +- arm_compute/runtime/CL/functions/CLCast.h | 24 +- .../runtime/CL/functions/CLChannelShuffleLayer.h | 10 +- .../runtime/CL/functions/CLConvolutionLayer.h | 31 +- arm_compute/runtime/CL/functions/CLCropResize.h | 8 + .../runtime/CL/functions/CLDeconvolutionLayer.h | 14 + .../runtime/CL/functions/CLDepthConvertLayer.h | 24 +- .../runtime/CL/functions/CLDepthToSpaceLayer.h | 11 +- .../CL/functions/CLDepthwiseConvolutionLayer.h | 14 + .../runtime/CL/functions/CLDequantizationLayer.h | 19 +- arm_compute/runtime/CL/functions/CLFillBorder.h | 10 +- arm_compute/runtime/CL/functions/CLFlattenLayer.h | 8 + .../runtime/CL/functions/CLFullyConnectedLayer.h | 12 + .../CL/functions/CLFuseBatchNormalization.h | 12 +- arm_compute/runtime/CL/functions/CLGEMM.h | 9 + .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 16 +- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 22 +- 
arm_compute/runtime/CL/functions/CLGather.h | 10 +- .../CL/functions/CLGenerateProposalsLayer.h | 10 + .../CL/functions/CLInstanceNormalizationLayer.h | 10 + .../runtime/CL/functions/CLL2NormalizeLayer.h | 12 +- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 9 + .../runtime/CL/functions/CLLSTMLayerQuantized.h | 10 +- .../runtime/CL/functions/CLMaxUnpoolingLayer.h | 12 + .../CL/functions/CLMeanStdDevNormalizationLayer.h | 12 +- .../runtime/CL/functions/CLNormalizationLayer.h | 10 + arm_compute/runtime/CL/functions/CLPadLayer.h | 9 + arm_compute/runtime/CL/functions/CLPriorBoxLayer.h | 11 +- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 8 + .../runtime/CL/functions/CLQuantizationLayer.h | 20 +- arm_compute/runtime/CL/functions/CLRNNLayer.h | 10 + arm_compute/runtime/CL/functions/CLROIAlignLayer.h | 13 +- .../runtime/CL/functions/CLROIPoolingLayer.h | 10 + arm_compute/runtime/CL/functions/CLRange.h | 18 +- arm_compute/runtime/CL/functions/CLReduceMean.h | 13 +- .../runtime/CL/functions/CLReductionOperation.h | 12 + arm_compute/runtime/CL/functions/CLRemap.h | 10 +- arm_compute/runtime/CL/functions/CLReorgLayer.h | 11 +- arm_compute/runtime/CL/functions/CLReverse.h | 10 +- arm_compute/runtime/CL/functions/CLSelect.h | 10 +- .../runtime/CL/functions/CLSpaceToBatchLayer.h | 9 + .../runtime/CL/functions/CLSpaceToDepthLayer.h | 11 +- arm_compute/runtime/CL/functions/CLSplit.h | 14 +- arm_compute/runtime/CL/functions/CLStackLayer.h | 10 +- arm_compute/runtime/CL/functions/CLTile.h | 10 +- arm_compute/runtime/CL/functions/CLUnstack.h | 10 +- .../CL/functions/CLWinogradConvolutionLayer.h | 12 +- .../runtime/NEON/functions/NEArgMinMaxLayer.h | 12 + .../NEON/functions/NEBatchNormalizationLayer.h | 12 +- .../runtime/NEON/functions/NEBatchToSpaceLayer.h | 11 +- arm_compute/runtime/NEON/functions/NEBitwiseAnd.h | 10 +- arm_compute/runtime/NEON/functions/NEBitwiseNot.h | 10 +- arm_compute/runtime/NEON/functions/NEBitwiseOr.h | 10 +- 
arm_compute/runtime/NEON/functions/NEBitwiseXor.h | 10 +- .../NEON/functions/NEBoundingBoxTransform.h | 11 + arm_compute/runtime/NEON/functions/NECast.h | 26 +- .../runtime/NEON/functions/NEChannelShuffleLayer.h | 10 +- .../runtime/NEON/functions/NEConvolutionLayer.h | 27 +- arm_compute/runtime/NEON/functions/NECropResize.h | 8 + .../runtime/NEON/functions/NEDeconvolutionLayer.h | 14 + .../runtime/NEON/functions/NEDepthConvertLayer.h | 24 +- .../runtime/NEON/functions/NEDepthToSpaceLayer.h | 11 +- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 14 + .../runtime/NEON/functions/NEDequantizationLayer.h | 19 +- arm_compute/runtime/NEON/functions/NEFillBorder.h | 8 + .../runtime/NEON/functions/NEFlattenLayer.h | 10 +- .../runtime/NEON/functions/NEFullyConnectedLayer.h | 13 +- .../NEON/functions/NEFuseBatchNormalization.h | 12 +- arm_compute/runtime/NEON/functions/NEGEMM.h | 10 + .../NEON/functions/NEGEMMConvolutionLayer.h | 15 + .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 20 + arm_compute/runtime/NEON/functions/NEGather.h | 10 +- .../NEON/functions/NEGenerateProposalsLayer.h | 10 + .../NEON/functions/NEInstanceNormalizationLayer.h | 12 +- .../runtime/NEON/functions/NEL2NormalizeLayer.h | 12 +- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 11 +- .../runtime/NEON/functions/NELSTMLayerQuantized.h | 8 + .../runtime/NEON/functions/NEMaxUnpoolingLayer.h | 12 + .../functions/NEMeanStdDevNormalizationLayer.h | 12 +- .../runtime/NEON/functions/NENormalizationLayer.h | 10 + arm_compute/runtime/NEON/functions/NEPadLayer.h | 9 + .../runtime/NEON/functions/NEPriorBoxLayer.h | 11 +- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 8 + .../runtime/NEON/functions/NEQuantizationLayer.h | 20 +- arm_compute/runtime/NEON/functions/NERNNLayer.h | 10 + .../runtime/NEON/functions/NEROIAlignLayer.h | 15 +- .../runtime/NEON/functions/NEROIPoolingLayer.h | 9 + arm_compute/runtime/NEON/functions/NERange.h | 17 +- arm_compute/runtime/NEON/functions/NEReduceMean.h | 13 +- 
.../runtime/NEON/functions/NEReductionOperation.h | 16 +- arm_compute/runtime/NEON/functions/NERemap.h | 8 + arm_compute/runtime/NEON/functions/NEReorgLayer.h | 11 +- arm_compute/runtime/NEON/functions/NEReverse.h | 10 +- arm_compute/runtime/NEON/functions/NESelect.h | 10 +- .../runtime/NEON/functions/NESpaceToBatchLayer.h | 9 + .../runtime/NEON/functions/NESpaceToDepthLayer.h | 9 + arm_compute/runtime/NEON/functions/NESplit.h | 14 +- arm_compute/runtime/NEON/functions/NEStackLayer.h | 10 +- arm_compute/runtime/NEON/functions/NETile.h | 10 +- arm_compute/runtime/NEON/functions/NEUnstack.h | 10 +- .../NEON/functions/NEWinogradConvolutionLayer.h | 10 + arm_compute/runtime/OperatorList.h | 301 ++- docs/09_operators_list.dox | 2156 +++++++++++++++++--- src/core/CL/cl_kernels/bounding_box_transform.cl | 4 +- .../cl_kernels/bounding_box_transform_quantized.cl | 4 +- src/core/CL/cl_kernels/crop_tensor.cl | 4 +- src/core/CL/cl_kernels/depth_to_space.cl | 10 +- src/core/NEON/kernels/NEReductionOperationKernel.h | 4 +- 115 files changed, 3206 insertions(+), 601 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h index c254284cd7..a971163c45 100644 --- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,6 +64,18 @@ public: /** Default destructor */ ~CLArgMinMaxLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:---------| + * |QASYMM8 |U32, S32 | + * |QASYMM8_SIGNED |U32, S32 | + * |S32 |U32, S32 | + * |F16 |U32, S32 | + * |F32 |U32, S32 | * * @param[in] input Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32. * @param[in] axis Axis to find max/min index. 
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index c8acf9fc6b..fcfeb5ea3b 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,16 @@ public: /** Default destructor */ ~CLBatchNormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | * * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place * diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h index bdb58531d0..f6ba2b0b02 100644 --- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,15 @@ public: /** Default destructor */ ~CLBatchToSpaceLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:---------|:---------|:----------| + * |All |S32 |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. 
Data types supported: S32 diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h index a703242875..b30be9b24f 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -40,6 +40,14 @@ class CLBitwiseAnd : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 Input tensor. Data types supported: U8. * @param[in] input2 Input tensor. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h index 6f65749d9f..1456ebe57e 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -40,6 +40,13 @@ class CLBitwiseNot : public ICLSimpleFunction { public: /** Initialize the function + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input Input tensor. Data types supported: U8. * @param[out] output Output tensor. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h index 3c904fb903..ff0a1f0d73 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -40,6 +40,14 @@ class CLBitwiseOr : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 Input tensor. Data types supported: U8. * @param[in] input2 Input tensor. Data types supported: U8. 
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h index a33a64ad71..0cd9d073b4 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -40,6 +40,14 @@ class CLBitwiseXor : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 Input tensor. Data types supported: U8. * @param[in] input2 Input tensor. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h index d6409106da..d3499c3949 100644 --- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h +++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,17 @@ class CLBoundingBoxTransform : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM16 |QASYMM8 |QASYMM16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32. * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. 
Data types supported: Same as @p input diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index bd333d4e72..6e4cf62547 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,17 +41,21 @@ class CLCast : public ICLSimpleFunction public: /** Initialize the function's source, destination * - * Input data type must be different than output data type. + * Valid data layouts: + * - All * - * Valid conversions Input -> Output : + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32 | + * |U16 | U8, S8, S16, U32, S32, F16, F32 | + * |S16 | U8, S8, U16, U32, S32, F16, F32 | + * |U32 | U8, S8, U16, S16, S32, F16, F32 | + * |S32 | U8, S8, U16, S16, U32, F16, F32 | + * |F16 | U8, S8, U16, S16, U32, F32 | + * |F32 | U8, S8, U16, S16, U32, F16 | * - * - U8 -> S8, U16, S16, U32, S32, F16, F32 - * - U16 -> U8, S8, S16, U32, S32, F16, F32 - * - S16 -> U8, S8, U16, U32, S32, F16, F32 - * - U32 -> U8, S8, U16, S16, S32, F16, F32 - * - S32 -> U8, S8, U16, S16, U32, F16, F32 - * - F16 -> U8, S8, U16, S16, U32, F32 - * - F32 -> U8, S8, U16, S16, U32, F16 + * Input data type must be different than output data type. * * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h index 54cf59f59a..d60548d9cc 100644 --- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h +++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. 
+ * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,14 @@ class CLChannelShuffleLayer : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Input tensor. Data types supported: All. * @param[out] output Output tensor. Data type supported: Same as @p input diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index d1de721193..6884754d83 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -84,14 +84,28 @@ public: /** Default move assignment operator */ CLConvolutionLayer &operator=(CLConvolutionLayer &&) = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. 
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -111,9 +125,9 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -133,8 +147,9 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. 
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -156,7 +171,7 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h index 0dc3c48b32..5c60c2879c 100644 --- a/arm_compute/runtime/CL/functions/CLCropResize.h +++ b/arm_compute/runtime/CL/functions/CLCropResize.h @@ -60,6 +60,14 @@ public: ~CLCropResize(); /** Configure kernel + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------|:--------|:------|:--------| + * |All |F32 |F32 |F32 | * * @note Supported tensor rank: up to 4 * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used. diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h index 4be8c17835..2dd4cd4bf5 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h @@ -45,6 +45,20 @@ public: CLDeconvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input, weights, biases and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED. 
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h index b0f297aec5..34dfdd7f3a 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,17 +41,21 @@ class CLDepthConvertLayer : public ICLSimpleFunction public: /** Initialize the function's source, destination * - * Input data type must be different than output data type. + * Valid data layouts: + * - All * - * Valid conversions Input -> Output : + * Valid data type configurations: + * |src |dst | + * |:--------------|:-------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32 | + * |U16 | U8, S8, S16, U32, S32, F16, F32 | + * |S16 | U8, S8, U16, U32, S32, F16, F32 | + * |U32 | U8, S8, U16, S16, S32, F16, F32 | + * |S32 | U8, S8, U16, S16, U32, F16, F32 | + * |F16 | U8, S8, U16, S16, U32, F32 | + * |F32 | U8, S8, U16, S16, U32, F16 | * - * - U8 -> S8, U16, S16, U32, S32, F16, F32 - * - U16 -> U8, S8, S16, U32, S32, F16, F32 - * - S16 -> U8, S8, U16, U32, S32, F16, F32 - * - U32 -> U8, S8, U16, S16, S32, F16, F32 - * - S32 -> U8, S8, U16, S16, U32, F16, F32 - * - F16 -> U8, S8, U16, S16, U32, F32 - * - F32 -> U8, S8, U16, S16, U32, F16 + * Input data type must be different than output data type. * * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. 
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h index a0aa288dbf..0026cc2b67 100644 --- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,15 @@ class CLDepthToSpaceLayer : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. Data types supported: same as @p input diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 1af9e1dc6f..f31a17d9cb 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -57,6 +57,20 @@ public: /** Default destructor */ ~CLDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW * @param[in] weights Weights tensor. 
These are 3D tensors with shape [kernel_x, kernel_y, IFM]. diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h index 4a5c3a3203..601c13d0e4 100644 --- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h @@ -58,18 +58,13 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:------------------|:--------------| - * |QASYMM8 |F16 | - * |QASYMM8 |F32 | - * |QASYMM8_SIGNED |F16 | - * |QASYMM8_SIGNED |F32 | - * |QSYMM8_PER_CHANNEL |F16 | - * |QSYMM8_PER_CHANNEL |F32 | - * |QSYMM8 |F16 | - * |QSYMM8 |F32 | - * |QSYMM16 |F16 | - * |QSYMM16 |F32 | + * |src |dst | + * |:------------------|:---------| + * |QASYMM8 |F16, F32 | + * |QASYMM8_SIGNED |F16, F32 | + * |QSYMM8_PER_CHANNEL |F16, F32 | + * |QSYMM8 |F16, F32 | + * |QSYMM16 |F16, F32 | * * @param[in] input Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h index a4ad82dfd4..20f2e15b72 100644 --- a/arm_compute/runtime/CL/functions/CLFillBorder.h +++ b/arm_compute/runtime/CL/functions/CLFillBorder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class CLFillBorder : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in,out] tensor Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. 
* @param[in] border_width The border width diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h index 8dedd7458d..d2c37b1c22 100644 --- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h +++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h @@ -43,6 +43,14 @@ class CLFlattenLayer : public IFunction { public: /** Initialise the kernel's input and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input First input tensor to flatten with at least 3 dimensions. * The dimensions above the third will be interpreted as batches. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index 75cb2dc1fa..eec01bcebe 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -116,6 +116,18 @@ public: /** Default move assignment operator */ CLFullyConnectedLayer &operator=(CLFullyConnectedLayer &&) = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h index e35905fcf1..cd75270392 100644 --- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h +++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,16 @@ public: /** Default destructor */ ~CLFuseBatchNormalization(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | * * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index 8a210a2ba5..1e2ae7be64 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -125,6 +125,15 @@ public: /** Default destructor */ ~CLGEMM(); /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |F32 |F32 | + * |F16 |F16 |F16 |F16 | * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. * diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 4dbd0f828a..082b481047 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. 
+ * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -193,6 +193,20 @@ public: /**Default destructor */ ~CLGEMMConvolutionLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 4cc8899690..e7f4cb9d01 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,6 +60,26 @@ public: /** Default destructor */ ~CLGEMMLowpMatrixMultiplyCore(); /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | * * @note GEMMLowp: low precision GEMM kernel. [A * B + C] * This kernel performs the following computations: diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h index 9c659be6fc..7a57c7358c 100644 --- a/arm_compute/runtime/CL/functions/CLGather.h +++ b/arm_compute/runtime/CL/functions/CLGather.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,14 @@ class CLGather : public ICLSimpleFunction { public: /** Initialise the kernel's inputs and outputs + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All. * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. 
Each value must be in range [0, input.shape[@p axis]) diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h index bea470712c..aec5cdf1a8 100644 --- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h +++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h @@ -76,6 +76,16 @@ public: ~CLGenerateProposalsLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QSYMM8 |QSYMM16 |QASYMM8 | * * @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors. * Data types supported: QASYMM8/F16/F32 diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h index a6e5b1622b..985a6a75f7 100644 --- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h @@ -64,6 +64,16 @@ public: ~CLInstanceNormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization. * Data types supported: F16/F32. 
Data layout supported: NHWC, NCHW diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h index 401d249eb4..4dc5c778d2 100644 --- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h +++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,6 +64,16 @@ public: CLL2NormalizeLayer &operator=(CLL2NormalizeLayer &&) = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC. * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index 38a24d030b..d26b4c5595 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -74,6 +74,15 @@ public: /** Default destructor */ ~CLLSTMLayer(); /** Initialize function's tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src13 | dst0 - dst3 | + * |:------------|:------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. 
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h index 0829052384..2ef7427a5a 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -72,6 +72,14 @@ public: /** Default move assignment operator */ CLLSTMLayerQuantized &operator=(CLLSTMLayerQuantized &&) = default; /** Initialize function's tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 | + * |:-----------|:------------|:-------|:------|:------|:------| + * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8| * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h index 24d620d372..f7ff1234f6 100644 --- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h @@ -55,6 +55,18 @@ public: /** Default destructor */ ~CLMaxUnpoolingLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @note Output shape must be equal to the shape of the original input to pool. 
* diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h index cfe59eac09..68a7df24e6 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,16 @@ class CLMeanStdDevNormalizationLayer : public ICLSimpleFunction { public: /** Initialise the function's input and outputs. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | * * @note If the output tensor is a nullptr, the normalization will be performed in-place. * diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h index 706cb6f152..15406f7728 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h @@ -60,6 +60,16 @@ public: /** Default destructor */ ~CLNormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | * * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler). 
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h index dae95f63e6..7f950bcfb3 100644 --- a/arm_compute/runtime/CL/functions/CLPadLayer.h +++ b/arm_compute/runtime/CL/functions/CLPadLayer.h @@ -58,6 +58,15 @@ public: ~CLPadLayer(); /** Initialize the function + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |All |All | * * @param[in] input Source tensor. Data types supported: All. * @param[out] output Output tensor. Data type supported: same as @p input diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h index 9129bfd064..9b36c9e433 100644 --- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h +++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,15 @@ public: /** Constructor */ CLPriorBoxLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------|:--------|:--------| + * |F32 |F32 |F32 | * * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC. * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1 diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 954f224424..bd00d56468 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -73,6 +73,14 @@ public: /** Default destructor */ ~CLQLSTMLayer(); /** Initialize function's tensors. 
+ * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 - src6 |src7 -src9 |src10 |src11 |dst0 |dst1 - dst2 | + * |:-------------|:------------|:------------|:------|:-------------|:------|:-----------------| + * |QASYMM8_SIGNED|QASYMM8 |S32 |QSYMM16|QASYMM8_SIGNED|QSYMM16|QASYMM8_SIGNED | * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h index 6a44a226d4..a61735cb97 100644 --- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h @@ -63,20 +63,12 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:------------------|:--------------| - * |QASYMM8 |QASYMM8 | - * |QASYMM8 |QASYMM8_SIGNED | - * |QASYMM8 |QASYMM16 | - * |QASYMM8_SIGNED |QASYMM8 | - * |QASYMM8_SIGNED |QASYMM8_SIGNED | - * |QASYMM8_SIGNED |QASYMM16 | - * |F16 |QASYMM8 | - * |F16 |QASYMM8_SIGNED | - * |F16 |QASYMM16 | - * |F32 |QASYMM8 | - * |F32 |QASYMM8_SIGNED | - * |F32 |QASYMM16 | + * |src |dst | + * |:------------------|:----------------------------------| + * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | * * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32. * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. 
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h index 50575daaa3..2b3b35e37d 100644 --- a/arm_compute/runtime/CL/functions/CLRNNLayer.h +++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h @@ -50,6 +50,16 @@ public: /** Default destructor */ ~CLRNNLayer(); /** Initialize the function + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |src3 |dst0 |dst1 | + * |:------|:------|:------|:------|:------|:------| + * |F16 |F16 |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 |F32 |F32 | * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h index b4cd5560ef..1eaea1b297 100644 --- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,17 @@ class CLROIAlignLayer : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |QASYMM8 |QASYMM16 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM16 |QASYMM8_SIGNED | * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
* @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h index a4c5c76f2e..151586a1f6 100644 --- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h @@ -44,6 +44,16 @@ class CLROIPoolingLayer : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |U16 |F16 | + * |F32 |U16 |F32 | + * |QASYMM8 |U16 |QASYMM8 | * * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8 * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h index e11e740861..fbce05162c 100644 --- a/arm_compute/runtime/CL/functions/CLRange.h +++ b/arm_compute/runtime/CL/functions/CLRange.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,22 @@ class CLRange : public ICLSimpleFunction { public: /** Initialize the kernel's start, end, step and output tensor. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |dst | + * |:---------| + * |U8 | + * |S8 | + * |QASYMM8 | + * |U16 | + * |S16 | + * |U32 | + * |S32 | + * |F16 | + * |F32 | * * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. * @param[in] start The starting value of the sequence. 
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h index c37ee8c5ab..1ce088b2ce 100644 --- a/arm_compute/runtime/CL/functions/CLReduceMean.h +++ b/arm_compute/runtime/CL/functions/CLReduceMean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,17 @@ public: /** Default constructor */ CLReduceMean(std::shared_ptr memory_manager = nullptr); /** Configure kernel + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @note Supported tensor rank: up to 4 * diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h index 58164fdcb3..2245735b62 100644 --- a/arm_compute/runtime/CL/functions/CLReductionOperation.h +++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h @@ -61,6 +61,18 @@ public: CLReductionOperation &operator=(CLReductionOperation &&) = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * |S32 |S32 | * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32. * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h index 87d5f9fec7..39ee13b8d0 100644 --- a/arm_compute/runtime/CL/functions/CLRemap.h +++ b/arm_compute/runtime/CL/functions/CLRemap.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -46,6 +46,14 @@ class CLRemap : public ICLSimpleFunction { public: /** Initialise the function's sources, destination, interpolation policy and border mode. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------|:------|:------|:------| + * |U8 |F32 |F32 |U8 | * * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) * @param[in] map_x Map for X coords. Data types supported: F32. diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h index 0840fd13fd..976b8f6213 100644 --- a/arm_compute/runtime/CL/functions/CLReorgLayer.h +++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,15 @@ class CLReorgLayer : public ICLSimpleFunction { public: /** Initialise the function's source and destination. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Source tensor. Data types supported: All. * @param[out] output Destination tensor with tensor shape: diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h index 81fa04b1f5..94c63ca92d 100644 --- a/arm_compute/runtime/CL/functions/CLReverse.h +++ b/arm_compute/runtime/CL/functions/CLReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class CLReverse : public ICLSimpleFunction { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |All |U32 |All | * * @param[in] input Input tensor. Data types supported: All. * @param[out] output Output tensor. Data type supported: Same as @p input diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h index 7fd52312fb..8b1e6b2019 100644 --- a/arm_compute/runtime/CL/functions/CLSelect.h +++ b/arm_compute/runtime/CL/functions/CLSelect.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,14 @@ class CLSelect : public ICLSimpleFunction { public: /** Initialise the kernel's inputs and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |U8 |All |All |All | * * @param[in] c Condition input tensor. Data types supported: U8. * @param[in] x First input tensor. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h index dc02fa1363..304a74137e 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h @@ -59,6 +59,15 @@ public: /** Default destructor */ ~CLSpaceToBatchLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:---------|:---------|:---------|:---------| + * |All |S32 |S32 |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. 
Supported M: 2. Data types supported: S32 diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h index 9e476fe7bd..8a47e95f9d 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -53,6 +53,15 @@ public: /** Default destructor */ ~CLSpaceToDepthLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. Data types supported: same as @p input diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h index 2931203765..86c7bdde7d 100644 --- a/arm_compute/runtime/CL/functions/CLSplit.h +++ b/arm_compute/runtime/CL/functions/CLSplit.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,18 @@ namespace arm_compute class CLSplit : public CPPSplit { public: + /** CLSplit + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + */ + // Inherited methods overridden: void run() override; }; diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h index 3861fd299a..54c903a706 100644 --- a/arm_compute/runtime/CL/functions/CLStackLayer.h +++ b/arm_compute/runtime/CL/functions/CLStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -58,6 +58,14 @@ public: /** Default destructor */ ~CLStackLayer(); /** Initialise the kernel's inputs vector and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @note Supported input tensor rank: up to 4 * diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h index 69743693ff..c266adbbd4 100644 --- a/arm_compute/runtime/CL/functions/CLTile.h +++ b/arm_compute/runtime/CL/functions/CLTile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class CLTile : public ICLSimpleFunction { public: /** Set the source, destination of the kernel + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Source tensor. Data type supported: All. * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension. diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h index 5d4d5710ab..32ad439b70 100644 --- a/arm_compute/runtime/CL/functions/CLUnstack.h +++ b/arm_compute/runtime/CL/functions/CLUnstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,14 @@ public: /** Default constructor */ CLUnstack(); /** Set the input, output and unstacking axis. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input A tensor to be unstacked. Data type supported: All. * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input. 
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 9ced69c1bb..7b42932f82 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,6 +61,16 @@ public: /** Default destructor */ ~CLWinogradConvolutionLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout * @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index cbf1d5b444..4392de7b28 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -63,6 +63,18 @@ public: /** Default destructor */ ~NEArgMinMaxLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:----------| + * |QASYMM8 |U32, S32 | + * |QASYMM8_SIGNED |U32, S32 | + * |S32 |U32, S32 | + * |F16 |U32, S32 | + * |F32 |U32, S32 | * * @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. 
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index 6d56a267a7..ec00fbdbf2 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,16 @@ public: /** Default destructor */ ~NEBatchNormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | * * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place * diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h index c2fd26d34c..810bf81a22 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,6 +51,15 @@ public: /** Default destructor */ ~NEBatchToSpaceLayer() = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:---------|:---------|:----------| + * |All |S32 |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M].
Data types supported: S32 diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h index 3203d2b9a7..1f95f193d3 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,14 @@ public: /** Default destructor */ ~NEBitwiseAnd() = default; /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h index 9fa0d38caf..c66bebf7cc 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,14 @@ class NEBitwiseNot : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's input and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input Input tensor. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h index fba6b784de..183df212e4 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -35,6 +35,14 @@ class NEBitwiseOr : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h index c6cb584284..126aaa6ddd 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,14 @@ class NEBitwiseXor : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h index c377520a12..2a196a2de5 100644 --- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h +++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h @@ -37,6 +37,17 @@ class NEBoundingBoxTransform : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM16 |QASYMM8 |QASYMM16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in] boxes Source tensor. 
Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32. * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index e536317660..eb7de1fadb 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,18 +40,22 @@ class NECast : public INESimpleFunctionNoBorder public: /** Initialize the function's source, destination * - * Input data type must be different than output data type. + * Valid data layouts: + * - All * - * Valid conversions Input -> Output : + * Valid data type configurations: + * |src |dst | + * |:--------------|:-----------------------------------------------| + * |QASYMM8_SIGNED | S16, S32, F32, F16 | + * |QASYMM8 | U16, S16, S32, F32, F16 | + * |U8 | U16, S16, S32, F32, F16 | + * |U16 | U8, U32 | + * |S16 | QASYMM8_SIGNED, U8, S32 | + * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | + * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | + * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8| * - * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 + * Input data type must be different than output data type. * * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32. * @param[out] output The output tensor. 
Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32. diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h index aa11396c20..8888efec4f 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h +++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,14 @@ class NEChannelShuffleLayer : public INESimpleFunctionNoBorder { public: /** Initialize the function + * + * Valid data layouts: + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index b1e85523c5..f19aa8008b 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -85,13 +85,28 @@ public: /** Default destructor */ ~NEConvolutionLayer() = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. 
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -110,9 +125,10 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -134,7 +150,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h index 7dcf925650..143bbbc6f1 100644 --- a/arm_compute/runtime/NEON/functions/NECropResize.h +++ b/arm_compute/runtime/NEON/functions/NECropResize.h @@ -53,6 +53,14 @@ public: ~NECropResize(); /** Configure kernel + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------|:--------|:------|:--------| + * |All |F32 |F32 |F32 | * * @note Supported tensor rank: up to 4 * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used. diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index c16cf26095..34ab0707c2 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -89,6 +89,20 @@ public: virtual ~NEDeconvolutionLayer() = default; /** Set the input, weights, biases and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. 
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h index c9817a63c1..17cf539717 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,15 +48,21 @@ public: ~NEDepthConvertLayer() = default; /** Initialize the function's source, destination * - * Valid conversions Input -> Output : + * Valid data layouts: + * - All * - * - QASYMM8 -> F16, F32 - * - U8 -> U16, S16, S32 - * - U16 -> U8, U32 - * - S16 -> U8, S32 - * - BFLOAT16 -> F32 - * - F16 -> QASYMM8, F32 - * - F32 -> QASYMM8, F16, BFLOAT16 + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------------------| + * |QASYMM8 | F16, F32 | + * |U8 | U16, S16, S32 | + * |U16 | U8, U32 | + * |S16 | U8, S32 | + * |BFLOAT16 | F32 | + * |F16 | QASYMM8, F32 | + * |F32 | QASYMM8, F16, BFLOAT16 | + * + * Input data type must be different than output data type. * * @param[in] input The input tensor to convert. Data types supported: QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. * @param[out] output The output tensor. Data types supported: QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index 51f7ff7770..b9bdcd1f11 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -52,6 +52,15 @@ public: /** Default destructor */ ~NEDepthToSpaceLayer() = default; /** Set the input and output tensors. 
+ * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All * @param[out] output Tensor output. Data types supported: same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index c74b2a93ee..2f541758f4 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -54,6 +54,20 @@ public: /** Default destructor */ ~NEDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 * @param[out] output Destination tensor. Data type supported: same as @p input. 
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h index dfec835f45..91ed056cf3 100644 --- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h @@ -57,18 +57,13 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:------------------|:--------------| - * |QASYMM8 |F16 | - * |QASYMM8 |F32 | - * |QASYMM8_SIGNED |F16 | - * |QASYMM8_SIGNED |F32 | - * |QSYMM8_PER_CHANNEL |F16 | - * |QSYMM8_PER_CHANNEL |F32 | - * |QSYMM8 |F16 | - * |QSYMM8 |F32 | - * |QSYMM16 |F16 | - * |QSYMM16 |F32 | + * |src |dst | + * |:------------------|:-----------| + * |QASYMM8 |F16, F32 | + * |QASYMM8_SIGNED |F16, F32 | + * |QSYMM8_PER_CHANNEL |F16, F32 | + * |QSYMM8 |F16, F32 | + * |QSYMM16 |F16, F32 | * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h index 8a8a0c7dc2..ab77c28839 100644 --- a/arm_compute/runtime/NEON/functions/NEFillBorder.h +++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h @@ -41,6 +41,14 @@ class NEFillBorder : public IFunction public: NEFillBorder(); /** Initialize the function's source, destination and border_mode. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @note This function fills the borders within the XY-planes. 
* diff --git a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h index 1104aac77f..e688e918d9 100644 --- a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class NEFlattenLayer : public IFunction { public: /** Initialise the kernel's input and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input First input tensor to flatten with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: All * @param[out] output Output tensor with shape [w*h*d, input_batches] where: diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index bc45e58b4b..9727e108a5 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -36,7 +36,6 @@ namespace arm_compute { - namespace weights_transformations { /** Basic function to manage the reshape weights generated from @ref NETranspose */ @@ -101,6 +100,18 @@ public: /** Default destructor */ ~NEFullyConnectedLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h index 5dc804e240..3dd7f49044 100644 --- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h +++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,6 +51,16 @@ public: /** Default destructor */ ~NEFuseBatchNormalization(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | * * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index a6c3436656..d4a9f68beb 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -75,6 +75,16 @@ public: /** Default destructor */ ~NEGEMM(); /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |F32 |F32 | + * |F16 |F16 |F16 |F16 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. 
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 9897bf1d4d..e89eae1d31 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -176,6 +176,21 @@ public: /** Default destructor */ ~NEGEMMConvolutionLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. 
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index b2b77bd406..780723e752 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -75,6 +75,26 @@ public: /** Default destructor */ ~NEGEMMLowpMatrixMultiplyCore(); /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | * * @note GEMM_LOWP: low precision GEMM kernel * This kernel performs the following computations: diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h index a5e0461227..393a38ee4d 100644 --- a/arm_compute/runtime/NEON/functions/NEGather.h +++ b/arm_compute/runtime/NEON/functions/NEGather.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,14 @@ class NEGather : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and outputs + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Source tensor. 
Supported tensor rank: up to 4. Data type supported: All * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index 22c6ba2ed6..3b683382ec 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -71,6 +71,16 @@ public: ~NEGenerateProposalsLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QSYMM8 |QSYMM16 |QASYMM8 | * * @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors. * Data types supported: QASYMM8/F16/F32 diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index 57165c94b4..bb0697072b 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,16 @@ public: /** Default destructor */ ~NEInstanceNormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization. 
* Data types supported: F16/F32. Data layout supported: NHWC, NCHW diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 173b9d2141..7f1a5e785e 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,16 @@ public: /** Default destructor */ ~NEL2NormalizeLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in, out] input Source tensor. Data types supported: F16/F32. (Written to only for border_size != 0) * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index ef8defb827..075fb4530a 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,15 @@ public: /** Default destructor */ ~NELSTMLayer(); /** Initialize function's tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src13 | dst0 - dst3 | + * |:------------|:------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. 
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index 53a024ae04..2f0c753691 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -76,6 +76,14 @@ public: /** Default destructor */ ~NELSTMLayerQuantized(); /** Initialize function's tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 | + * |:-----------|:------------|:-------|:------|:------|:------| + * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8| * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h index fae26b3c93..41ea040457 100644 --- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h @@ -56,6 +56,18 @@ public: /** Default destructor */ ~NEMaxUnpoolingLayer(); /** Set the input and output tensors. 
+ * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @note Only supported pool size 2 * diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h index 31e376191c..41aa81946b 100644 --- a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,6 +49,16 @@ public: /** Default destructor */ ~NEMeanStdDevNormalizationLayer(); /** Initialise the function's input and outputs. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | * * @note If the output tensor is a nullptr, the normalization will be performed in-place. * diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index 8c4ad1516e..fbe000445c 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -62,6 +62,16 @@ public: /** Default destructor */ ~NENormalizationLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], * and an optional 4th dimension for batch of inputs. Data type supported: F16/F32. Data layouts supported: NCHW/NHWC. 
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h index 76ff0643a0..4aa6725496 100644 --- a/arm_compute/runtime/NEON/functions/NEPadLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h @@ -64,6 +64,15 @@ public: /** Default destructor */ ~NEPadLayer(); /** Initialize the function + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |All |All | * * @param[in] input Source tensor. Data types supported: All. * @param[out] output Output tensor. Data type supported: same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h index 3cc79fa28e..38e0c9f3ad 100644 --- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,15 @@ class NEPriorBoxLayer : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------|:--------|:--------| + * |F32 |F32 |F32 | * * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC. * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1 diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index e706179415..7c2e9bc5a1 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -75,6 +75,14 @@ public: /** Default destructor */ ~NEQLSTMLayer(); /** Initialize function's tensors. 
+ * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 - src6 |src7 - src9 |src10 |src11 |dst0 |dst1 - dst2 | + * |:-------------|:------------|:------------|:------|:-------------|:------|:-----------------| + * |QASYMM8_SIGNED|QASYMM8 |S32 |QSYMM16|QASYMM8_SIGNED|QSYMM16|QASYMM8_SIGNED | * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h index a7fadfc7cd..eeca2bb1db 100644 --- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h @@ -56,20 +56,12 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:------------------|:--------------| - * |QASYMM8 |QASYMM8 | - * |QASYMM8 |QASYMM8_SIGNED | - * |QASYMM8 |QASYMM16 | - * |QASYMM8_SIGNED |QASYMM8 | - * |QASYMM8_SIGNED |QASYMM8_SIGNED | - * |QASYMM8_SIGNED |QASYMM16 | - * |F16 |QASYMM8 | - * |F16 |QASYMM8_SIGNED | - * |F16 |QASYMM16 | - * |F32 |QASYMM8 | - * |F32 |QASYMM8_SIGNED | - * |F32 |QASYMM16 | + * |src |dst | + * |:------------------|:--------------------------------------| + * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | * * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. * @param[out] output Destination tensor with the same dimensions of input.
Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h index 66f7f2ea3f..667d3144ac 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -53,6 +53,16 @@ public: /** Default destructor */ ~NERNNLayer(); /** Initialize the function + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |src3 |dst0 |dst1 | + * |:------|:------|:------|:------|:------|:------| + * |F16 |F16 |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 |F32 |F32 | * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h index c72cd494d2..ea1af4daea 100644 --- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h @@ -37,11 +37,22 @@ class NEROIAlignLayer : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |QASYMM8 |QASYMM16 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM16 |QASYMM8_SIGNED | * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. 
- * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input + * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. * @@ -54,7 +65,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, + * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, * otherwise same as @p input * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h index 214dd43402..2992b3eb95 100644 --- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h @@ -52,6 +52,15 @@ public: /** Default destructor */ ~NEROIPoolingLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F32 |U16 |F32 | + * |QASYMM8 |U16 |QASYMM8 | * * @param[in] input Source tensor. 
Data types supported: QASYMM8/F32 * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h index 28976001d7..cb14c8fdde 100644 --- a/arm_compute/runtime/NEON/functions/NERange.h +++ b/arm_compute/runtime/NEON/functions/NERange.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,21 @@ public: /** Default destructor */ ~NERange(); /** Initialize the kernel's start, end, step and output tensor. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |dst | + * |:---------| + * |U8 | + * |S8 | + * |U16 | + * |S16 | + * |U32 | + * |S32 | + * |F16 | + * |F32 | * * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] start The starting value of the sequence. diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index 89cd09812b..7512115a3f 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -53,6 +53,17 @@ public: /** Default destructor */ ~NEReduceMean(); /** Configure kernel + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @note Supported tensor rank: up to 4 * diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index b96b70926c..533c10adcf 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -58,7 +58,19 @@ public: ~NEReductionOperation(); /** Set the input and output tensors. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. (Written to only for border_size != 0) + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * |S32 |S32 | + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. (Written to only for border_size != 0) * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. @@ -68,7 +80,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation. * - * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. * @param[in] output Destination tensor info. 
Data types and data layouts supported: same as @p input. * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h index 835ebfab7e..1693078f66 100644 --- a/arm_compute/runtime/NEON/functions/NERemap.h +++ b/arm_compute/runtime/NEON/functions/NERemap.h @@ -43,6 +43,14 @@ class NERemap : public INESimpleFunction { public: /** Initialise the function's sources, destination, interpolation policy and border mode. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------|:------|:------|:------| + * |U8 |F32 |F32 |U8 | * * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) * @param[in] map_x Map for X coordinates. Data type supported: F32. diff --git a/arm_compute/runtime/NEON/functions/NEReorgLayer.h b/arm_compute/runtime/NEON/functions/NEReorgLayer.h index f76d1d252c..0a7d824d10 100644 --- a/arm_compute/runtime/NEON/functions/NEReorgLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReorgLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,15 @@ class NEReorgLayer : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and outputs + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input First tensor input. Data type supported: All * @param[out] output Output tensor.
Data type supported: Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index 2048dafcb5..c02fff54a5 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class NEReverse : public INESimpleFunctionNoBorder { public: /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |All |U32 |All | * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NESelect.h b/arm_compute/runtime/NEON/functions/NESelect.h index c66fbfa7d4..c8e5a204dd 100644 --- a/arm_compute/runtime/NEON/functions/NESelect.h +++ b/arm_compute/runtime/NEON/functions/NESelect.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class NESelect : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |U8 |All |All |All | * * @param[in] c Condition input tensor. Data types supported: U8. * @param[in] x First input tensor. Data types supported: All. 
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h index 27c1ddf8e3..ad8c1467d0 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h @@ -57,6 +57,15 @@ public: /** Default destructor */ ~NESpaceToBatchLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:---------|:---------|:---------|:---------| + * |All |S32 |S32 |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h index 73c228d8ee..1820cb8f6b 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h @@ -52,6 +52,15 @@ public: /** Default destructor */ ~NESpaceToDepthLayer(); /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. Data types supported: same as @p input diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h index ede5ecf65a..206f299c06 100644 --- a/arm_compute/runtime/NEON/functions/NESplit.h +++ b/arm_compute/runtime/NEON/functions/NESplit.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,6 +40,18 @@ namespace arm_compute class NESplit : public CPPSplit { public: + /** NESplit + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + */ + // Inherited methods overridden: void run() override; }; diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h index f6fa4f2eb3..ae4e468f21 100644 --- a/arm_compute/runtime/NEON/functions/NEStackLayer.h +++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,14 @@ public: /** Default destructor */ ~NEStackLayer(); /** Initialise the kernel's inputs vector and output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @note Supported input tensor rank: up to 4 * diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h index d5ce76c9cf..915e5aa1da 100644 --- a/arm_compute/runtime/NEON/functions/NETile.h +++ b/arm_compute/runtime/NEON/functions/NETile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,14 @@ class NETile : public INESimpleFunctionNoBorder { public: /** Set the source, destination of the kernel + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input Source tensor. Data type supported: All. * @param[out] output Destination tensor. 
Same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index c8e85115f7..079fee5b9e 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,14 @@ public: /** Default destructor */ ~NEUnstack() = default; /** Set the input, output and unstacking axis. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in] input A tensor to be unstacked. Data type supported: All. * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index 3367b10a96..77f9093ed4 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -65,6 +65,16 @@ public: ~NEWinogradConvolutionLayer() = default; /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. 
diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h index 8c43c68b90..a659a79423 100644 --- a/arm_compute/runtime/OperatorList.h +++ b/arm_compute/runtime/OperatorList.h @@ -40,7 +40,7 @@ * */ -/** ArgMinMaxLayer (not ported) +/** ArgMinMaxLayer * * Description: * Function to calculate the index of the minimum or maximum values in a tensor based on an axis. @@ -71,27 +71,27 @@ * */ -/** BatchNormalizationLayer (not ported) +/** BatchNormalizationLayer * * Description: - * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f] + * Function to perform batch normalization. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** BatchToSpaceLayer (not ported) +/** BatchToSpaceLayer * * Description: - * Rearranges (permutes) data from batch into blocks of spatial data, followed by cropping. It is the reverse transformation of SpaceToBatch (from TF website) + * Batch to space transformation. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_BATCH_TO_SPACE_ND * */ -/** BitwiseAnd (not ported) +/** BitwiseAnd * * Description: * Function to performe bitwise AND between 2 tensors. @@ -101,7 +101,7 @@ * */ -/** BitwiseNot (not ported) +/** BitwiseNot * * Description: * Function to performe bitwise NOT. @@ -111,7 +111,7 @@ * */ -/** BitwiseOr (not ported) +/** BitwiseOr * * Description: * Function to performe bitwise OR between 2 tensors. @@ -121,27 +121,27 @@ * */ -/** BitwiseXor (not ported) +/** BitwiseXor * * Description: * Function to performe bitwise XOR between 2 tensors. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** BoundingBoxTransform (not ported) +/** BoundingBoxTransform * * Description: - * Function to . + * Transform proposal bounding boxes to target bounding box using bounding box deltas. * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** Cast (not ported) +/** Cast * * Description: * Function to cast a tensor. 
@@ -151,20 +151,20 @@ * */ -/** ChannelShuffelLayer (not ported) +/** ChannelShuffleLayer * * Description: - * Function to cast a tensor. + * Function to shuffle the channels of the input tensor. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_CHANNEL_SHUFFLE * */ -/** Comparison (not ported) (only CL) +/** Comparison (only CL) * * Description: - * Function to cast a tensor. + * Function to compare 2 tensors. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_EQUAL @@ -192,11 +192,11 @@ * Function to tranpose the wieghts for the fully connected layer. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** ConvolutionLayer (not ported) +/** ConvolutionLayer * * Description: * Function to compute a convolution layer. @@ -212,74 +212,74 @@ * Function to copy a tensor. * * Equivalent Android NNAPI Op: - * None + * n/a * */ /** Crop (only CL) * * Description: - * Function to . + * Performs a copy of input tensor to the output tensor. * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** CropResize (not ported) +/** CropResize * * Description: - * Function to . + * Function to perform cropping and resizing. * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** DeconvolutionLayer (not ported) +/** DeconvolutionLayer * * Description: - * Function to . + * Function to compute a deconvolution or tranpose convolution. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_TRANSPOSE_CONV_2D * */ -/** DeconvolutionLayerUpsample (only CL) (not ported) +/** DeconvolutionLayerUpsample (only CL) * * Description: - * Function to . + * Function to execute deconvolution upsample on OpenCL. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_TRANSPOSE_CONV_2D * */ -/** DepthConverterLayer (not ported) +/** DepthConvertLayer * * Description: - * Function to . + * Performs a down-scaling depth conversion. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** DepthToSpaceLayer (not ported) +/** DepthToSpaceLayer * * Description: - * Function to . 
+ * Depth to Space transformation. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_DEPTH_TO_SPACE * */ -/** DepthwiseConvolutionLayer (not ported) +/** DepthwiseConvolutionLayer * * Description: - * Function to perform depthwise separable convolution + * Function to perform depthwise separable convolution. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_DEPTHWISE_CONV_2D @@ -289,17 +289,17 @@ /** DequantizationLayer * * Description: - * Function to dequantize the values in a tensor + * Function to dequantize the values in a tensor. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_DEQUANTIZE * */ -/** DetectionPostProcessLayer (not ported) (no CL) +/** DetectionPostProcessLayer (no CL) * * Description: - * Function to generate the detection output based on center size encoded boxes, class prediction and anchors by doing non maximum suppression (NMS) + * Function to generate the detection output based on center size encoded boxes, class prediction and anchors by doing non maximum suppression (NMS). * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_DETECTION_POSTPROCESSING @@ -309,7 +309,7 @@ /** DirectConvolutionLayer * * Description: - * Function to + * Function to compute direct convolution. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_CONV_2D @@ -319,7 +319,7 @@ /** DirectDeconvolutionLayer (only CL) * * Description: - * Function to + * Function to run the deconvolution layer. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_TRANSPOSE_CONV_2D @@ -387,27 +387,27 @@ /** FFT1D * * Description: - * Fast Fourier Transform 1D + * Fast Fourier Transform 1D. * * Equivalent Android NNAPI Op: - * None + * n/a * */ /** FFT2D * * Description: - * Fast Fourier Transform 2D + * Fast Fourier Transform 2D. * * Equivalent Android NNAPI Op: - * None + * n/a * */ /** FFTConvolutionLayer * * Description: - * Fast Fourier Transform Convolution + * Fast Fourier Transform Convolution. 
* * Equivalent Android NNAPI Op: * ANEURALNETWORKS_CONV_2D @@ -417,24 +417,24 @@ /** Fill * * Description: - * Set the values of a tensor with a given value + * Set the values of a tensor with a given value. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_FILL * */ -/** FillBorder (not ported) +/** FillBorder * * Description: - * + * Function to . * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** FlattenLayer (not ported) +/** FlattenLayer * * Description: * Reshape a tensor to be 1D @@ -447,104 +447,104 @@ /** Floor * * Description: - * Round the value to the lowest number + * Round the value to the lowest number. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_FLOOR * */ -/** FullyConnectedLayer (not ported) +/** FullyConnectedLayer * * Description: - * Function to perform a fully connected / dense layer + * Function to perform a fully connected / dense layer. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_FULLY_CONNECTED * */ -/** FuseBatchNormalization (not ported) +/** FuseBatchNormalization * * Description: - * Function to . + * Function to fuse the batch normalization node to a preceding convolution node. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** Gather (not ported) +/** Gather * * Description: - * Function to . + * Performs the Gather operation along the chosen axis. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_GATHER * */ -/** GEMM (not ported) +/** GEMM * * Description: * General Matrix Multiplication. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** GEMMConv2D (not ported) (no CL) +/** GEMMConv2D (no CL) * * Description: * General Matrix Multiplication. * * Equivalent Android NNAPI Op: - * None + * ANEURALNETWORKS_CONV_2D * */ -/** GEMMConvolutionLayer (not ported) +/** GEMMConvolutionLayer * * Description: * General Matrix Multiplication. 
* * Equivalent Android NNAPI Op: - * None + * ANEURALNETWORKS_CONV_2D * */ -/** GEMMDeconvolutionLayer (not ported) (only CL) +/** GEMMDeconvolutionLayer (only CL) * * Description: * General Matrix Multiplication. * * Equivalent Android NNAPI Op: - * None + * ANEURALNETWORKS_TRANSPOSE_CONV_2D * */ -/** GEMMLowpMatrixMultiplyCore (not ported) +/** GEMMLowpMatrixMultiplyCore * * Description: * General Matrix Multiplication. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** GEMMLowpOutputStage (not ported) +/** GEMMLowpOutputStage * * Description: * General Matrix Multiplication. * * Equivalent Android NNAPI Op: - * None + * n/a * */ -/** GenerateProposalsLayer (not ported) +/** GenerateProposalsLayer * * Description: * Function to generate proposals for a RPN (Region Proposal Network). @@ -554,7 +554,7 @@ * */ -/** InstanceNormalizationLayer (not ported) +/** InstanceNormalizationLayer * * Description: * Function to perform a Instance normalization on a given axis. @@ -564,7 +564,7 @@ * */ -/** L2NormalizationLayer (not ported) +/** L2NormalizeLayer * * Description: * Function to perform a L2 normalization on a given axis. @@ -583,102 +583,92 @@ * - Logical NOT * * Equivalent Android NNAPI Op: - * None? + * n/a * */ /** LogicalAnd (only CL) * * Description: - * Function to perform Logical AND + * Function to perform Logical AND. * * Equivalent Android NNAPI Op: - * None? + * n/a * */ /** LogicalOr (only CL) * * Description: - * Function to perform Logical OR + * Function to perform Logical OR. * * Equivalent Android NNAPI Op: - * None? + * n/a * */ /** LogicalNot (only CL) * * Description: - * Function to perform Logical NOT + * Function to perform Logical NOT. * * Equivalent Android NNAPI Op: - * None? + * n/a * */ -/** LSTMLayer (not ported) +/** LSTMLayer * * Description: - * Function to perform LSTM + * Function to perform a single time step in a Long Short-Term Memory (LSTM) layer. 
* * Equivalent Android NNAPI Op: * ANEURALNETWORKS_LSTM * */ -/** LSTMLayerQuantized (not ported) +/** LSTMLayerQuantized * * Description: - * Function to perform LSTM + * Function to perform quantized LSTM (Long Short-Term Memory) * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_QUANTIZED_LSTM - * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM ? + * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM * */ -/** MaxUnpoolingLayer (not ported) +/** MaxUnpoolingLayer * * Description: - * Function to perform MaxUnpooling + * Function to perform MaxUnpooling. * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** MeanStdDevNormalizationLayer (not ported) +/** MeanStdDevNormalizationLayer * * Description: * Function to execute mean and standard deviation normalization. * * Equivalent Android NNAPI Op: - * None ? + * n/a * */ -/** MeanStdDevNormalizationLayer (not ported) - * - * Description: - * Function to execute mean and standard deviation normalization. - * - * Equivalent Android NNAPI Op: - * None ? - * - */ - -/** NormalizationLayer (not ported) +/** NormalizationLayer * * Description: * Function to compute normalization layer. * * Equivalent Android NNAPI Op: - * None ? + * ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION * */ -/** PadLayer (not ported) +/** PadLayer * * Description: * Function to pad a tensor. @@ -731,24 +721,24 @@ * */ -/** PriorBoxLayer (not ported) +/** PriorBoxLayer * * Description: - * Function to compute the activation layer with the PRELU activation function. + * Function to . * * Equivalent Android NNAPI Op: - * ? + * n/a * */ -/** QLSTMLayer (not ported) +/** QLSTMLayer * * Description: - * Function to perform LSTM + * Function to perform quantized LSTM (Long Short-Term Memory). * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_QUANTIZED_LSTM - * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM ? + * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM * */ @@ -762,17 +752,17 @@ * */ -/** Range (not ported) +/** Range * * Description: - * Function to . 
+ * Function to generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'. * * Equivalent Android NNAPI Op: - * none? + * n/a * */ -/** RecudeMean (not ported) +/** ReduceMean * * Description: * Function to performe reduce mean operation. @@ -782,22 +772,7 @@ * */ -/** RecudeOperation (not ported) - * - * Description: - * Function to performe reduce mean operation. - * - * Equivalent Android NNAPI Op: - * ANEURALNETWORKS_REDUCE_ALL - * ANEURALNETWORKS_REDUCE_ANY - * ANEURALNETWORKS_REDUCE_MAX - * ANEURALNETWORKS_REDUCE_MIN - * ANEURALNETWORKS_REDUCE_PROD - * ANEURALNETWORKS_REDUCE_SUM - * - */ - -/** RecudeOperation (not ported) +/** ReductionOperation * * Description: * Function to performe reduce with the following operations @@ -820,20 +795,20 @@ * */ -/** ReorgLayer (not ported) +/** ReorgLayer * * Description: - * Function to performe reorg + * Performs a reorganization layer of input tensor to the output tensor. * * Equivalent Android NNAPI Op: - * None? + * n/a * */ /** ReshapeLayer * * Description: - * Fucntion to reshape a tensor + * Function to reshape a tensor. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_RESHAPE @@ -841,40 +816,40 @@ * */ -/** ReverseLayer (not ported) +/** Reverse * * Description: - * Fucntion to . + * Function to reverse tensor according to axis. * * Equivalent Android NNAPI Op: - * None? + * n/a * */ -/** RNNLayer (not ported) +/** RNNLayer * * Description: - * Fucntion to perform RNN . + * Function to perform recurrent neural network layer. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_RNN * */ -/** ROIAligmentLayer (not ported) +/** ROIAlignLayer * * Description: - * Fucntion to perform RNN . + * Function to perform ROI alignment. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_ROI_ALIGN * */ -/** ROIPoolingLayer (not ported) +/** ROIPoolingLayer * * Description: - * Fucntion to perform RNN . + * Function to perform ROI pooling. 
* * Equivalent Android NNAPI Op: * ANEURALNETWORKS_ROI_POOLING @@ -884,8 +859,8 @@ /** Scale * * Description: - * Fucntion to perform resize a tensor using to interpolate: - * - Bilenear + * Function to perform resize a tensor using to interpolate: + * - Bilinear * - Nearest neighbor * * Equivalent Android NNAPI Op: @@ -894,10 +869,10 @@ * */ -/** Select (not ported) +/** Select * * Description: - * Fucntion to select values from 2 tensors depending on an input tensor of booleans. + * Function to select values from 2 tensors depending on an input tensor of booleans. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_SELECT @@ -925,7 +900,7 @@ * */ -/** SpaceToBatchLayer (not ported) +/** SpaceToBatchLayer * * Description: * Function to divide a tensor spatially. @@ -935,7 +910,7 @@ * */ -/** SpaceToDepthLayer (not ported) +/** SpaceToDepthLayer * * Description: * Function to rearrange blocks of spatial data into depth. @@ -945,7 +920,7 @@ * */ -/** Split (not ported) +/** Split * * Description: * Function to split a tensor along a given axis. @@ -955,13 +930,13 @@ * */ -/** StackLayer (not ported) +/** StackLayer * * Description: * Function to stack tensors along an axis. * * Equivalent Android NNAPI Op: - * none + * n/a * */ @@ -975,7 +950,7 @@ * */ -/** Tile (not ported) +/** Tile * * Description: * Function to construct a tensor by tiling a given tensor. @@ -988,40 +963,40 @@ /** Transpose * * Description: - * Function to transpose an 2D tensor. + * Function to transpose a 2D tensor. * * Equivalent Android NNAPI Op: * ANEURALNETWORKS_TRANSPOSE * */ -/** Unstack (not ported) +/** Unstack * * Description: * Function to unpack a rank-R tensor into rank-(R-1) tensors. * * Equivalent Android NNAPI Op: - * none + * n/a * */ -/** WinogradConvolutionLayer (not ported) +/** WinogradConvolutionLayer * * Description: - * Function to. + * Function to do Winograd Convolution. 
* * Equivalent Android NNAPI Op: - * None + * ANEURALNETWORKS_CONV_2D * */ -/** WinogradInputTransform (not ported) (only CL) +/** WinogradInputTransform (only CL) * * Description: * Function to. * * Equivalent Android NNAPI Op: - * None + * n/a * */ diff --git a/docs/09_operators_list.dox b/docs/09_operators_list.dox index 82a127bbd3..244f292f82 100644 --- a/docs/09_operators_list.dox +++ b/docs/09_operators_list.dox @@ -107,14 +107,1531 @@ where N = batches, C = channels, H = height, W = width F16F16 F32F32 + + ArgMinMaxLayer + Function to calculate the index of the minimum or maximum values in a tensor based on an axis. + +
    +
  • ANEURALNETWORKS_ARGMAX +
  • ANEURALNETWORKS_ARGMIN +
+ NEArgMinMaxLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8U32, S32 +
QASYMM8_SIGNEDU32, S32 +
S32U32, S32 +
F16U32, S32 +
F32U32, S32 +
+ + CLArgMinMaxLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8U32, S32 +
QASYMM8_SIGNEDU32, S32 +
S32U32, S32 +
F16U32, S32 +
F32U32, S32 +
+ + BatchNormalizationLayer + Function to perform batch normalization. + +
    +
  • n/a +
+ NEBatchNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + CLBatchNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + BatchToSpaceLayer + Batch to space transformation. + +
    +
  • ANEURALNETWORKS_BATCH_TO_SPACE_ND +
+ NEBatchToSpaceLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1dst +
AllS32All +
+ + CLBatchToSpaceLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1dst +
AllS32All +
+ + BitwiseAnd + Function to perform bitwise AND between 2 tensors. + +
    +
  • ANEURALNETWORKS_LOGICAL_AND +
+ NEBitwiseAnd + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + CLBitwiseAnd + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + BitwiseNot + Function to perform bitwise NOT. + +
    +
  • ANEURALNETWORKS_LOGICAL_NOT +
+ NEBitwiseNot + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + CLBitwiseNot + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + BitwiseOr + Function to perform bitwise OR between 2 tensors. + +
    +
  • ANEURALNETWORKS_LOGICAL_OR +
+ NEBitwiseOr + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + CLBitwiseOr + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + BitwiseXor + Function to perform bitwise XOR between 2 tensors. + +
    +
  • n/a +
+ NEBitwiseXor + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + CLBitwiseXor + +
    +
  • All +
+ + +
srcdst +
U8U8 +
+ + BoundingBoxTransform + Transform proposal bounding boxes to target bounding box using bounding box deltas. + +
    +
  • n/a +
+ NEBoundingBoxTransform + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1dst +
QASYMM16QASYMM8QASYMM16 +
F16F16F16 +
F32F32F32 +
+ + CLBoundingBoxTransform + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1dst +
QASYMM16QASYMM8QASYMM16 +
F16F16F16 +
F32F32F32 +
+ + Cast + Function to cast a tensor. + +
    +
  • ANEURALNETWORKS_CAST +
+ NECast + +
    +
  • All +
+ + +
srcdst +
QASYMM8_SIGNEDS16, S32, F32, F16 +
QASYMM8U16, S16, S32, F32, F16 +
U8U16, S16, S32, F32, F16 +
U16U8, U32 +
S16QASYMM8_SIGNED, U8, S32 +
F16QASYMM8_SIGNED, QASYMM8, F32, S32, U8 +
S32QASYMM8_SIGNED, QASYMM8, F16, F32, U8 +
F32QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 +
+ + CLCast + +
    +
  • All +
+ + +
srcdst +
U8S8, U16, S16, U32, S32, F16, F32 +
U16U8, S8, S16, U32, S32, F16, F32 +
S16U8, S8, U16, U32, S32, F16, F32 +
U32U8, S8, U16, S16, S32, F16, F32 +
S32U8, S8, U16, S16, U32, F16, F32 +
F16U8, S8, U16, S16, U32, F32 +
F32U8, S8, U16, S16, U32, F16 +
+ + ChannelShuffleLayer + Function to shuffle the channels of the input tensor. + +
    +
  • ANEURALNETWORKS_CHANNEL_SHUFFLE +
+ NEChannelShuffleLayer + +
    +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + CLChannelShuffleLayer + +
    +
  • NCHW +
+ + +
srcdst +
AllAll +
ConcatenateLayer Function to concatenate tensors along a given axis.
    -
  • ANEURALNETWORKS_CONCATENATION +
  • ANEURALNETWORKS_CONCATENATION +
+ NEConcatenateLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + CLConcatenateLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + ConvertFullyConnectedWeights + Function to transpose the weights for the fully connected layer. + +
    +
  • n/a +
+ NEConvertFullyConnectedWeights + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + CLConvertFullyConnectedWeights + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + ConvolutionLayer + Function to compute a convolution layer. + +
    +
  • ANEURALNETWORKS_CONV_2D +
+ NEConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + CLConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + Copy + Function to copy a tensor. + +
    +
  • n/a +
+ NECopy + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CLCopy + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CropResize + Function to perform cropping and resizing. + +
    +
  • n/a +
+ NECropResize + +
    +
  • NHWC +
+ + +
src0src1src2dst +
AllF32F32F32 +
+ + CLCropResize + +
    +
  • NHWC +
+ + +
src0src1src2dst +
AllF32F32F32 +
+ + DeconvolutionLayer + Function to compute a deconvolution or transpose convolution. + +
    +
  • ANEURALNETWORKS_TRANSPOSE_CONV_2D +
+ NEDeconvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + CLDeconvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + DepthConvertLayer + Performs a down-scaling depth conversion. + +
    +
  • n/a +
+ NEDepthConvertLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8F16, F32 +
U8U16, S16, S32 +
U16U8, U32 +
S16U8, S32 +
BFLOAT16F32 +
F16QASYMM8, F32 +
F32QASYMM8, F16, BFLOAT16 +
+ + CLDepthConvertLayer + +
    +
  • All +
+ + +
srcdst +
U8S8, U16, S16, U32, S32, F16, F32 +
U16U8, S8, S16, U32, S32, F16, F32 +
S16U8, S8, U16, U32, S32, F16, F32 +
U32U8, S8, U16, S16, S32, F16, F32 +
S32U8, S8, U16, S16, U32, F16, F32 +
F16U8, S8, U16, S16, U32, F32 +
F32U8, S8, U16, S16, U32, F16 +
+ + DepthToSpaceLayer + Depth to Space transformation. + +
    +
  • ANEURALNETWORKS_DEPTH_TO_SPACE +
+ NEDepthToSpaceLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + CLDepthToSpaceLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + DepthwiseConvolutionLayer + Function to perform depthwise separable convolution. + +
    +
  • ANEURALNETWORKS_DEPTHWISE_CONV_2D +
+ NEDepthwiseConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + CLDepthwiseConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + DequantizationLayer + Function to dequantize the values in a tensor. + +
    +
  • ANEURALNETWORKS_DEQUANTIZE +
+ NEDequantizationLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8F16, F32 +
QASYMM8_SIGNEDF16, F32 +
QSYMM8_PER_CHANNELF16, F32 +
QSYMM8F16, F32 +
QSYMM16F16, F32 +
+ + CLDequantizationLayer + +
    +
  • All +
+ + +
srcdst +
QASYMM8F16, F32 +
QASYMM8_SIGNEDF16, F32 +
QSYMM8_PER_CHANNELF16, F32 +
QSYMM8F16, F32 +
QSYMM16F16, F32 +
+ + DirectConvolutionLayer + Function to compute direct convolution. + +
    +
  • ANEURALNETWORKS_CONV_2D +
+ NEDirectConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
+ + CLDirectConvolutionLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
+ + FFT1D + Fast Fourier Transform 1D. + +
    +
  • n/a +
+ NEFFT1D + +
    +
  • All +
+ + +
srcdst +
F32F32 +
+ + CLFFT1D + +
    +
  • All +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + FFT2D + Fast Fourier Transform 2D. + +
    +
  • n/a +
+ NEFFT2D + +
    +
  • All +
+ + +
srcdst +
F32F32 +
+ + CLFFT2D + +
    +
  • All +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + FFTConvolutionLayer + Fast Fourier Transform Convolution. + +
    +
  • ANEURALNETWORKS_CONV_2D +
+ NEFFTConvolutionLayer + +
    +
  • All +
+ + +
srcdst +
F32F32 +
+ + CLFFTConvolutionLayer + +
    +
  • All +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + Fill + Set the values of a tensor with a given value. + +
    +
  • ANEURALNETWORKS_FILL +
+ NEFill + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CLFill + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + FillBorder + Function to fill the borders within the XY-plane. + +
    +
  • n/a +
+ NEFillBorder + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CLFillBorder + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + FlattenLayer + Reshape a tensor to be 1D. + +
    +
  • ANEURALNETWORKS_RESHAPE +
+ NEFlattenLayer + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CLFlattenLayer + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + Floor + Round the value to the lowest number. + +
    +
  • ANEURALNETWORKS_FLOOR +
+ NEFloor + +
    +
  • All +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + CLFloor + +
    +
  • All +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + FullyConnectedLayer + Function to perform a fully connected / dense layer. + +
    +
  • ANEURALNETWORKS_FULLY_CONNECTED +
+ NEFullyConnectedLayerReshapeWeightsManaged + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
+ + CLFullyConnectedLayerReshapeWeightsManaged + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
+ + FuseBatchNormalization + Function to fuse the batch normalization node to a preceding convolution node. + +
    +
  • n/a +
+ NEFuseBatchNormalization + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + CLFuseBatchNormalization + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + Gather + Performs the Gather operation along the chosen axis. + +
    +
  • ANEURALNETWORKS_GATHER +
+ NEGather + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + CLGather + +
    +
  • All +
+ + +
srcdst +
AllAll +
+ + GEMM + General Matrix Multiplication. + +
    +
  • n/a +
+ NEGEMM + +
    +
  • All +
+ + +
src0src1src2dst +
F32F32F32F32 +
F16F16F16F16 +
BFLOAT16BFLOAT16BFLOAT16BFLOAT16 +
+ + CLGEMMReshapeRHSMatrixKernelManaged + +
    +
  • All +
+ + +
src0src1src2dst +
F32F32F32F32 +
F16F16F16F16 +
+ + GEMMConvolutionLayer + General Matrix Multiplication. + +
    +
  • ANEURALNETWORKS_CONV_2D +
+ NEConvolutionLayerReshapeWeights + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
BFLOAT16BFLOAT16BFLOAT16BFLOAT16 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + CLConvolutionLayerReshapeWeights + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
+ + GEMMLowpMatrixMultiplyCore + General Matrix Multiplication. + +
    +
  • n/a +
+ NEGEMMLowpMatrixMultiplyCore + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8QSYMM8S32QASYMM8 +
QASYMM8QASYMM8S32S32 +
QASYMM8QSYMM8_PER_CHANNELS32S32 +
QASYMM8QSYMM8S32S32 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8S32QASYMM8_SIGNED +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32S32 +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32S32 +
QASYMM8_SIGNEDQSYMM8S32S32 +
+ + CLGEMMLowpMatrixMultiplyCore + +
    +
  • NHWC +
  • NCHW +
+ + +
src0src1src2dst +
QASYMM8QASYMM8S32QASYMM8 +
QASYMM8QSYMM8_PER_CHANNELS32QASYMM8 +
QASYMM8QSYMM8S32QASYMM8 +
QASYMM8QASYMM8S32S32 +
QASYMM8QSYMM8_PER_CHANNELS32S32 +
QASYMM8QSYMM8S32S32 +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32QASYMM8_SIGNED +
QASYMM8_SIGNEDQSYMM8S32QASYMM8_SIGNED +
QASYMM8_SIGNEDQASYMM8_SIGNEDS32S32 +
QASYMM8_SIGNEDQSYMM8_PER_CHANNELS32S32 +
QASYMM8_SIGNEDQSYMM8S32S32 +
+ + GenerateProposalsLayer + Function to generate proposals for a RPN (Region Proposal Network). + +
    +
  • ANEURALNETWORKS_GENERATE_PROPOSALS +
+ NEGenerateProposalsLayer + +
    +
  • All +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QSYMM8QSYMM16QASYMM8 +
+ + CLGenerateProposalsLayer + +
    +
  • All +
+ + +
src0src1src2dst +
F16F16F16F16 +
F32F32F32F32 +
QASYMM8QSYMM8QSYMM16QASYMM8 +
+ + InstanceNormalizationLayer + Function to perform a Instance normalization on a given axis. + +
    +
  • ANEURALNETWORKS_INSTANCE_NORMALIZATION +
+ NEInstanceNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F16F16 +
F32F32 +
+ + CLInstanceNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F16F16 +
F32F32 +
+ + L2NormalizeLayer + Function to perform a L2 normalization on a given axis. + +
    +
  • ANEURALNETWORKS_L2_NORMALIZATION +
+ NEL2NormalizeLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F16F16 +
F32F32 +
+ + CLL2NormalizeLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F16F16 +
F32F32 +
+ + LSTMLayer + Function to perform a single time step in a Long Short-Term Memory (LSTM) layer. + +
    +
  • ANEURALNETWORKS_LSTM +
+ NELSTMLayer + +
    +
  • All +
+ + +
src0 - src13dst0 - dst3 +
F16F16 +
F32F32 +
+ + CLLSTMLayer + +
    +
  • All +
+ + +
src0 - src13dst0 - dst3 +
F16F16 +
F32F32 +
+ + LSTMLayerQuantized + Function to perform quantized LSTM (Long Short-Term Memory). + +
    +
  • ANEURALNETWORKS_QUANTIZED_LSTM +
  • ANEURALNETWORKS_QUANTIZED_16BIT_LSTM +
+ NELSTMLayerQuantized + +
    +
  • All +
+ + +
src0 - src8src9 - src12src13src14dst0dst1 +
QASYMM8S32QSYMM16QASYMM8QSYMM16QASYMM8 +
+ + CLLSTMLayerQuantized + +
    +
  • All +
+ + +
src0 - src8src9 - src12src13src14dst0dst1 +
QASYMM8S32QSYMM16QASYMM8QSYMM16QASYMM8 +
+ + MaxUnpoolingLayer + Function to perform MaxUnpooling. + +
    +
  • n/a +
+ NEMaxUnpoolingLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + CLMaxUnpoolingLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + MeanStdDevNormalizationLayer + Function to execute mean and standard deviation normalization. + +
    +
  • n/a +
+ NEMeanStdDevNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + CLMeanStdDevNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + NormalizationLayer + Function to compute normalization layer. + +
    +
  • ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION +
+ NENormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + CLNormalizationLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
F32F32 +
F16F16 +
+ + PadLayer + Function to pad a tensor. + +
    +
  • ANEURALNETWORKS_PAD +
  • ANEURALNETWORKS_PAD_V2 +
+ NEPadLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + CLPadLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + Permute + Function to transpose an ND tensor. + +
    +
  • ANEURALNETWORKS_TRANSPOSE +
+ NEPermute + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + CLPermute + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
AllAll +
+ + PixelWiseMultiplication + Function to perform a multiplication. + +
    +
  • ANEURALNETWORKS_MUL +
+ NEPixelWiseMultiplication + +
    +
  • All +
+ + +
src0src1dst +
QASYMM8QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED +
QSYMM16QSYMM16QSYMM16 +
QSYMM16QSYMM16S32 +
U8U8U8 +
U8U8S16 +
U8S16S16 +
S16U8S16 +
S16S16S16 +
F16F16F16 +
F32S32F32 +
+ + CLPixelWiseMultiplication + +
    +
  • All +
+ + +
src0src1dst +
QASYMM8QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED +
QSYMM16QSYMM16QSYMM16 +
QSYMM16QSYMM16S32 +
U8U8U8 +
U8U8S16 +
U8S16S16 +
S16U8S16 +
S16S16S16 +
F16F16F16 +
F32S32F32 +
+ + PoolingLayer + Function to perform pooling with the specified pooling operation. + +
    +
  • ANEURALNETWORKS_AVERAGE_POOL_2D +
  • ANEURALNETWORKS_L2_POOL_2D +
  • ANEURALNETWORKS_MAX_POOL_2D
- NEConcatenateLayer + NEPoolingLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + CLPoolingLayer + +
    +
  • NHWC +
  • NCHW +
+ + +
srcdst +
QASYMM8QASYMM8 +
QASYMM8_SIGNEDQASYMM8_SIGNED +
F16F16 +
F32F32 +
+ + PReluLayer + Function to compute the activation layer with the PRELU activation function. + +
    +
  • ANEURALNETWORKS_PRELU +
+ NEPReluLayer
  • All @@ -128,7 +1645,7 @@ where N = batches, C = channels, H = height, W = width F32F32 - CLConcatenateLayer + CLPReluLayer
    • All @@ -142,13 +1659,224 @@ where N = batches, C = channels, H = height, W = width F32F32 - ConvertFullyConnectedWeights - Function to tranpose the wieghts for the fully connected layer. + PriorBoxLayer + Function to .
        -
      • None +
      • n/a
      - NEConvertFullyConnectedWeights + NEPriorBoxLayer + +
        +
      • NHWC +
      • NCHW +
      + + +
      src0src1dst +
      F32F32F32 +
      + + CLPriorBoxLayer + +
        +
      • NHWC +
      • NCHW +
      + + +
      src0src1dst +
      F32F32F32 +
      + + QLSTMLayer + Function to perform quantized LSTM (Long Short-Term Memory). + +
        +
      • ANEURALNETWORKS_QUANTIZED_LSTM +
      • ANEURALNETWORKS_QUANTIZED_16BIT_LSTM +
      + NEQLSTMLayer + +
        +
      • All +
      + + +
      src0src1 - src6src7 -src9src10src11dst0dst1 - dst2 +
      QASYMM8_SIGNEDQASYMM8S32QSYMM16QASYMM8_SIGNEDQSYMM16QASYMM8_SIGNED +
      + + CLQLSTMLayer + +
        +
      • All +
      + + +
      src0src1 - src6src7 -src9src10src11dst0dst1 - dst2 +
      QASYMM8_SIGNEDQASYMM8S32QSYMM16QASYMM8_SIGNEDQSYMM16QASYMM8_SIGNED +
      + + QuantizationLayer + Function to perform quantization layer + +
        +
      • ANEURALNETWORKS_QUANTIZE +
      + NEQuantizationLayer + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      QASYMM8_SIGNEDQASYMM8, QASYMM8_SIGNED, QASYMM16 +
      F16QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      F32QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      + + CLQuantizationLayer + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      QASYMM8_SIGNEDQASYMM8, QASYMM8_SIGNED, QASYMM16 +
      F16QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      F32QASYMM8, QASYMM8_SIGNED, QASYMM16 +
      + + Range + Function to generate a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'. + +
        +
      • n/a +
      + NERange + +
        +
      • All +
      + + +
      dst +
      U8 +
      S8 +
      U16 +
      S16 +
      U32 +
      S32 +
      F16 +
      F32 +
      + + CLRange + +
        +
      • All +
      + + +
      dst +
      U8 +
      S8 +
      QASYMM8 +
      U16 +
      S16 +
      U32 +
      S32 +
      F16 +
      F32 +
      + + ReduceMean + Function to perform reduce mean operation. + +
        +
      • ANEURALNETWORKS_MEAN +
      + NEReduceMean + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8 +
      QASYMM8_SIGNEDQASYMM8_SIGNED +
      F16F16 +
      F32F32 +
      + + CLReduceMean + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8 +
      QASYMM8_SIGNEDQASYMM8_SIGNED +
      F16F16 +
      F32F32 +
      + + ReductionOperation + Function to perform reduction with the following operations - ARG_IDX_MAX: Index of the max value - ARG_IDX_MIN: Index of the min value - MEAN_SUM: Mean of sum - PROD: Product - SUM_SQUARE: Sum of squares - SUM: Sum - MIN: Min - MAX: Max + +
        +
      • ANEURALNETWORKS_REDUCE_ALL +
      • ANEURALNETWORKS_REDUCE_ANY +
      • ANEURALNETWORKS_REDUCE_MAX +
      • ANEURALNETWORKS_REDUCE_MIN +
      • ANEURALNETWORKS_REDUCE_PROD +
      • ANEURALNETWORKS_REDUCE_SUM +
      + NEReductionOperation + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8 +
      QASYMM8_SIGNEDQASYMM8_SIGNED +
      F16F16 +
      F32F32 +
      S32S32 +
      + + CLReductionOperation + +
        +
      • All +
      + + +
      srcdst +
      QASYMM8QASYMM8 +
      QASYMM8_SIGNEDQASYMM8_SIGNED +
      F16F16 +
      F32F32 +
      S32S32 +
      + + ReorgLayer + Performs a reorganization layer of input tensor to the output tensor. + +
        +
      • n/a +
      + NEReorgLayer
      • NHWC @@ -160,7 +1888,7 @@ where N = batches, C = channels, H = height, W = width AllAll - CLConvertFullyConnectedWeights + CLReorgLayer
        • NHWC @@ -172,13 +1900,14 @@ where N = batches, C = channels, H = height, W = width AllAll - Copy - Function to copy a tensor. + ReshapeLayer + Function to reshape a tensor.
            -
          • None +
          • ANEURALNETWORKS_RESHAPE +
          • ANEURALNETWORKS_SQUEEZE
          - NECopy + NEReshapeLayer
          • All @@ -189,7 +1918,7 @@ where N = batches, C = channels, H = height, W = width AllAll - CLCopy + CLReshapeLayer
            • All @@ -200,59 +1929,41 @@ where N = batches, C = channels, H = height, W = width AllAll - DequantizationLayer - Function to dequantize the values in a tensor + Reverse + Function to reverse tensor according to axis.
                -
              • ANEURALNETWORKS_DEQUANTIZE +
              • n/a
              - NEDequantizationLayer + NEReverse
              • All
              -
              srcdst -
              QASYMM8F16 -
              QASYMM8F32 -
              QASYMM8_SIGNEDF16 -
              QASYMM8_SIGNEDF32 -
              QSYMM8_PER_CHANNELF16 -
              QSYMM8_PER_CHANNELF32 -
              QSYMM8F16 -
              QSYMM8F32 -
              QSYMM16F16 -
              QSYMM16F32 +
              src0src1dst +
              AllU32All
              - CLDequantizationLayer + CLReverse
              • All
              -
              srcdst -
              QASYMM8F16 -
              QASYMM8F32 -
              QASYMM8_SIGNEDF16 -
              QASYMM8_SIGNEDF32 -
              QSYMM8_PER_CHANNELF16 -
              QSYMM8_PER_CHANNELF32 -
              QSYMM8F16 -
              QSYMM8F32 -
              QSYMM16F16 -
              QSYMM16F32 +
              src0src1dst +
              AllU32All
              - DirectConvolutionLayer - Function to + RNNLayer + Function to perform recurrent neural network layer.
                -
              • ANEURALNETWORKS_CONV_2D +
              • ANEURALNETWORKS_RNN
              - NEDirectConvolutionLayer + NERNNLayer
              • NHWC @@ -260,12 +1971,12 @@ where N = batches, C = channels, H = height, W = width
              -
              src0src1src2dst -
              F16F16F16F16 -
              F32F32F32F32 +
              src0src1src2src3dst0dst1 +
              F16F16F16F16F16F16 +
              F32F32F32F32F32F32
              - CLDirectConvolutionLayer + CLRNNLayer
              • NHWC @@ -273,135 +1984,152 @@ where N = batches, C = channels, H = height, W = width
              -
              src0src1src2dst -
              F16F16F16F16 -
              F32F32F32F32 -
              QASYMM8QASYMM8S32QASYMM8 -
              QASYMM8_SIGNEDQASYMM8_SIGNEDS32QASYMM8_SIGNED +
              src0src1src2src3dst0dst1 +
              F16F16F16F16F16F16 +
              F32F32F32F32F32F32
              - FFT1D - Fast Fourier Transform 1D + ROIAlignLayer + Function to perform ROI alignment.
                -
              • None +
              • ANEURALNETWORKS_ROI_ALIGN
              - NEFFT1D + NEROIAlignLayer
              • All
              -
              srcdst -
              F32F32 +
              src0src1dst +
              F16F16F16 +
              F32F32F32 +
              QASYMM8QASYMM16QASYMM8 +
              QASYMM8_SIGNEDQASYMM16QASYMM8_SIGNED
              - CLFFT1D + CLROIAlignLayer
              • All
              -
              srcdst -
              F32F32 -
              F16F16 +
              src0src1dst +
              F16F16F16 +
              F32F32F32 +
              QASYMM8QASYMM16QASYMM8 +
              QASYMM8_SIGNEDQASYMM16QASYMM8_SIGNED
              - FFT2D - Fast Fourier Transform 2D + ROIPoolingLayer + Function to perform ROI pooling.
                -
              • None +
              • ANEURALNETWORKS_ROI_POOLING
              - NEFFT2D + NEROIPoolingLayer
              • All
              -
              srcdst -
              F32F32 +
              src0src1dst +
              F32U16F32 +
              QASYMM8U16QASYMM8
              - CLFFT2D + CLROIPoolingLayer
              • All
              -
              srcdst -
              F32F32 -
              F16F16 +
              src0src1dst +
              F16U16F16 +
              F32U16F32 +
              QASYMM8U16QASYMM8
              - FFTConvolutionLayer - Fast Fourier Transform Convolution + Scale + Function to perform a resize of a tensor using the interpolation methods: - Bilinear - Nearest neighbor + +
                -
              • ANEURALNETWORKS_CONV_2D +
              • ANEURALNETWORKS_RESIZE_BILINEAR +
              • ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR
              - NEFFTConvolutionLayer + NEScale
                -
              • All +
              • NHWC +
              • NCHW
              srcdst +
              QASYMM8QASYMM8 +
              QASYMM8_SIGNEDQASYMM8_SIGNED +
              F16F16
              F32F32 +
              U8U8 +
              S16S16
              - CLFFTConvolutionLayer + CLScale
                -
              • All +
              • NHWC +
              • NCHW
              srcdst -
              F32F32 +
              QASYMM8QASYMM8 +
              QASYMM8_SIGNEDQASYMM8_SIGNED
              F16F16 +
              F32F32 +
              U8U8 +
              S16S16
              - Fill - Set the values of a tensor with a given value + Select + Function to select values from 2 tensors depending on an input tensor of booleans.
                -
              • ANEURALNETWORKS_FILL +
              • ANEURALNETWORKS_SELECT
              - NEFill + NESelect
              • All
              -
              srcdst -
              AllAll +
              src0src1src2dst +
              U8AllAllAll
              - CLFill + CLSelect
              • All
              -
              srcdst -
              AllAll +
              src0src1src2dst +
              U8AllAllAll
              - Floor - Round the value to the lowest number + Slice + Function to perform tensor slicing.
                -
              • ANEURALNETWORKS_FLOOR +
              • ANEURALNETWORKS_SLICE
              - NEFloor + NESlice
              • All @@ -409,11 +2137,10 @@ where N = batches, C = channels, H = height, W = width
                srcdst -
                F32F32 -
                F16F16 +
                AllAll
                - CLFloor + CLSlice
                • All @@ -421,17 +2148,16 @@ where N = batches, C = channels, H = height, W = width
                  srcdst -
                  F32F32 -
                  F16F16 +
                  AllAll
                  - Permute - Function to transpose an ND tensor. + SpaceToBatchLayer + Function to divide a tensor spatially.
                    -
                  • ANEURALNETWORKS_TRANSPOSE +
                  • ANEURALNETWORKS_SPACE_TO_BATCH_ND
                  - NEPermute + NESpaceToBatchLayer
                  • NHWC @@ -439,11 +2165,11 @@ where N = batches, C = channels, H = height, W = width
                  -
                  srcdst -
                  AllAll +
                  src0src1src2dst +
                  AllS32S32All
                  - CLPermute + CLSpaceToBatchLayer
                  • NHWC @@ -451,67 +2177,17 @@ where N = batches, C = channels, H = height, W = width
                  -
                  srcdst -
                  AllAll -
                  - - PixelWiseMultiplication - Function to performe a multiplication. - -
                    -
                  • ANEURALNETWORKS_MUL -
                  - NEPixelWiseMultiplication - -
                    -
                  • All -
                  - - -
                  src0src1dst -
                  QASYMM8QASYMM8QASYMM8 -
                  QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED -
                  QSYMM16QSYMM16QASYMM16 -
                  QSYMM16QSYMM16S32 -
                  U8U8U8 -
                  U8U8S16 -
                  U8S16S16 -
                  S16U8S16 -
                  S16S16S16 -
                  F16F16F16 -
                  F32S32F32 -
                  - - CLPixelWiseMultiplication - -
                    -
                  • All -
                  - - -
                  src0src1dst -
                  QASYMM8QASYMM8QASYMM8 -
                  QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED -
                  QSYMM16QSYMM16QASYMM16 -
                  QSYMM16QSYMM16S32 -
                  U8U8U8 -
                  U8U8S16 -
                  U8S16S16 -
                  S16U8S16 -
                  S16S16S16 -
                  F16F16F16 -
                  F32S32F32 +
                  src0src1src2dst +
                  AllS32S32All
                  - PoolingLayer - Function to performe pooling with the specified pooling operation. + SpaceToDepthLayer + Function to rearrange blocks of spatial data into depth.
                    -
                  • ANEURALNETWORKS_AVERAGE_POOL_2D -
                  • ANEURALNETWORKS_L2_POOL_2D -
                  • ANEURALNETWORKS_MAX_POOL_2D +
                  • ANEURALNETWORKS_SPACE_TO_DEPTH
                  - NEPoolingLayer + NESpaceToDepthLayer
                  • NHWC @@ -520,13 +2196,10 @@ where N = batches, C = channels, H = height, W = width
                    srcdst -
                    QASYMM8QASYMM8 -
                    QASYMM8_SIGNEDQASYMM8_SIGNED -
                    F16F16 -
                    F32F32 +
                    AllAll
                    - CLPoolingLayer + CLSpaceToDepthLayer
                    • NHWC @@ -535,19 +2208,16 @@ where N = batches, C = channels, H = height, W = width
                      srcdst -
                      QASYMM8QASYMM8 -
                      QASYMM8_SIGNEDQASYMM8_SIGNED -
                      F16F16 -
                      F32F32 +
                      AllAll
                      - PReluLayer - Function to compute the activation layer with the PRELU activation function. + Split + Function to split a tensor along a given axis.
                        -
                      • ANEURALNETWORKS_PRELU +
                      • ANEURALNETWORKS_SPLIT
                      - NEPReluLayer + NESplit
                      • All @@ -555,13 +2225,10 @@ where N = batches, C = channels, H = height, W = width
                        srcdst -
                        QASYMM8QASYMM8 -
                        QASYMM8_SIGNEDQASYMM8_SIGNED -
                        F16F16 -
                        F32F32 +
                        AllAll
                        - CLPReluLayer + CLSplit
                        • All @@ -569,19 +2236,16 @@ where N = batches, C = channels, H = height, W = width
                          srcdst -
                          QASYMM8QASYMM8 -
                          QASYMM8_SIGNEDQASYMM8_SIGNED -
                          F16F16 -
                          F32F32 +
                          AllAll
                          - QuantizationLayer - Function to perform quantization layer + StackLayer + Function to stack tensors along an axis.
                            -
                          • ANEURALNETWORKS_QUANTIZE +
                          • n/a
                          - NEQuantizationLayer + NEStackLayer
                          • All @@ -589,21 +2253,10 @@ where N = batches, C = channels, H = height, W = width
                            srcdst -
                            QASYMM8QASYMM8 -
                            QASYMM8QASYMM8_SIGNED -
                            QASYMM8QASYMM16 -
                            QASYMM8_SIGNEDQASYMM8 -
                            QASYMM8_SIGNEDQASYMM8_SIGNED -
                            QASYMM8_SIGNEDQASYMM16 -
                            F16QASYMM8 -
                            F16QASYMM8_SIGNED -
                            F16QASYMM16 -
                            F32QASYMM8 -
                            F32QASYMM8_SIGNED -
                            F32QASYMM16 +
                            AllAll
                            - CLQuantizationLayer + CLStackLayer
                            • All @@ -611,28 +2264,16 @@ where N = batches, C = channels, H = height, W = width
                              srcdst -
                              QASYMM8QASYMM8 -
                              QASYMM8QASYMM8_SIGNED -
                              QASYMM8QASYMM16 -
                              QASYMM8_SIGNEDQASYMM8 -
                              QASYMM8_SIGNEDQASYMM8_SIGNED -
                              QASYMM8_SIGNEDQASYMM16 -
                              F16QASYMM8 -
                              F16QASYMM8_SIGNED -
                              F16QASYMM16 -
                              F32QASYMM8 -
                              F32QASYMM8_SIGNED -
                              F32QASYMM16 +
                              AllAll
                              - ReshapeLayer - Fucntion to reshape a tensor + StridedSlice + Function to extract a strided slice of a tensor.
                                -
                              • ANEURALNETWORKS_RESHAPE -
                              • ANEURALNETWORKS_SQUEEZE +
                              • ANEURALNETWORKS_STRIDED_SLICE
                              - NEReshapeLayer + NEStridedSlice
                              • All @@ -643,7 +2284,7 @@ where N = batches, C = channels, H = height, W = width AllAll - CLReshapeLayer + CLStridedSlice
                                • All @@ -654,54 +2295,41 @@ where N = batches, C = channels, H = height, W = width AllAll - Scale - Fucntion to perform resize a tensor using to interpolate: - Bilenear - Nearest neighbor + Tile + Function to construct a tensor by tiling a given tensor.
                                    -
                                  • ANEURALNETWORKS_RESIZE_BILINEAR -
                                  • ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR +
                                  • ANEURALNETWORKS_TILE
                                  - NEScale + NETile
                                    -
                                  • NHWC -
                                  • NCHW +
                                  • All
                                  srcdst -
                                  QASYMM8QASYMM8 -
                                  QASYMM8_SIGNEDQASYMM8_SIGNED -
                                  F16F16 -
                                  F32F32 -
                                  U8U8 -
                                  S16S16 +
                                  AllAll
                                  - CLScale + CLTile
                                    -
                                  • NHWC -
                                  • NCHW +
                                  • All
                                  srcdst -
                                  QASYMM8QASYMM8 -
                                  QASYMM8_SIGNEDQASYMM8_SIGNED -
                                  F16F16 -
                                  F32F32 -
                                  U8U8 -
                                  S16S16 +
                                  AllAll
                                  - Slice - Function to perform tensor slicing. + Transpose + Function to transpose a 2D tensor.
                                    -
                                  • ANEURALNETWORKS_SLICE +
                                  • ANEURALNETWORKS_TRANSPOSE
                                  - NESlice + NETranspose
                                  • All @@ -712,7 +2340,7 @@ where N = batches, C = channels, H = height, W = width AllAll - CLSlice + CLTranspose
                                    • All @@ -723,13 +2351,13 @@ where N = batches, C = channels, H = height, W = width AllAll - StridedSlice - Function to extract a strided slice of a tensor. + Unstack + Function to unpack a rank-R tensor into rank-(R-1) tensors.
                                        -
                                      • ANEURALNETWORKS_STRIDED_SLICE +
                                      • n/a
                                      - NEStridedSlice + NEUnstack
                                      • All @@ -740,7 +2368,7 @@ where N = batches, C = channels, H = height, W = width AllAll - CLStridedSlice + CLUnstack
                                        • All @@ -751,32 +2379,36 @@ where N = batches, C = channels, H = height, W = width AllAll - Transpose - Function to transpose an 2D tensor. + WinogradConvolutionLayer + Function to do Winograd Convolution.
                                            -
                                          • ANEURALNETWORKS_TRANSPOSE +
                                          • ANEURALNETWORKS_CONV_2D
                                          - NETranspose + NEWinogradConvolutionLayer
                                            -
                                          • All +
                                          • NHWC +
                                          • NCHW
                                          -
                                          srcdst -
                                          AllAll +
                                          src0src1src2dst +
                                          F16F16F16F16 +
                                          F32F32F32F32
                                          - CLTranspose + CLWinogradConvolutionLayer
                                            -
                                          • All +
                                          • NHWC +
                                          • NCHW
                                          -
                                          srcdst -
                                          AllAll +
                                          src0src1src2dst +
                                          F16F16F16F16 +
                                          F32F32F32F32
                                          diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl index a9b0496a6e..f2e9cb0ed0 100644 --- a/src/core/CL/cl_kernels/bounding_box_transform.cl +++ b/src/core/CL/cl_kernels/bounding_box_transform.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants -/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time +/** Transform proposal bounding boxes to target bounding box using bounding box deltas. * * @attention The following variables must be passed at compile time: * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32 diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl index 9e5cee55f4..c1d45a56b9 100644 --- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl +++ b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants -/** Perform a padded copy of input tensor to the output tensor for quantized data types. 
Padding values are defined at compile time +/** Transform proposal bounding boxes to target bounding box using bounding box deltas for quantized data types. * * @attention The following variables must be passed at compile time: * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for for deltas diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/crop_tensor.cl index 62ae36ac5c..d9090dc838 100644 --- a/src/core/CL/cl_kernels/crop_tensor.cl +++ b/src/core/CL/cl_kernels/crop_tensor.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #if defined(DATA_TYPE) // Compile time constants -/** Performs a copy of input tensor to the output tensor. +/** Performs a tensor cropping. * * @param[in] in_ptr Pointer to the source tensor. Supported data types: All * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl index d3231a59a1..f301e64d66 100644 --- a/src/core/CL/cl_kernels/depth_to_space.cl +++ b/src/core/CL/cl_kernels/depth_to_space.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,10 +24,10 @@ #include "helpers.h" #if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) -/** Batch to space transformation. (NCHW) +/** Depth to space transformation. (NCHW) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. 
-DBLOCK_SHAPE=2 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. @@ -66,10 +66,10 @@ __kernel void depth_to_space_nchw( *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr); } -/** Batch to space transformation. (NHWC) +/** Depth to space transformation. (NHWC) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h index 667305b3aa..08e654fd21 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -59,7 +59,7 @@ public: /** Set the source, destination of the kernel * - * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. + * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX. * Output will have the same number of dimensions as input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 @@ -69,7 +69,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel. * - * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. 
Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX. * Output will have the same number of dimensions as input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 -- cgit v1.2.1