From 6124ce60b54eb5639ed19d46c79fce21cca2c83b Mon Sep 17 00:00:00 2001 From: Sheri Zhang Date: Tue, 4 May 2021 14:03:13 +0100 Subject: Update operator list part3 Partially resolve: COMPMID-4199 Signed-off-by: Sheri Zhang Change-Id: Id24702d258fb4e04ad948e7cf6c0efd98d2a5456 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5561 Reviewed-by: TeresaARM Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- arm_compute/runtime/CL/functions/CLComparison.h | 10 +- arm_compute/runtime/CL/functions/CLCrop.h | 8 + .../CL/functions/CLDeconvolutionLayerUpsample.h | 9 + .../CL/functions/CLDirectDeconvolutionLayer.h | 14 + .../runtime/CL/functions/CLElementWiseUnaryLayer.h | 63 ++ .../runtime/CL/functions/CLElementwiseOperations.h | 124 +++- .../CL/functions/CLGEMMDeconvolutionLayer.h | 13 +- .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 12 +- arm_compute/runtime/CL/functions/CLLogicalAnd.h | 8 + arm_compute/runtime/CL/functions/CLLogicalNot.h | 8 + arm_compute/runtime/CL/functions/CLLogicalOr.h | 8 + arm_compute/runtime/CL/functions/CLSoftmaxLayer.h | 11 + .../CL/functions/CLWinogradInputTransform.h | 12 +- .../runtime/NEON/functions/NEArithmeticAddition.h | 3 +- .../NEON/functions/NEArithmeticSubtraction.h | 19 + .../NEON/functions/NEDetectionPostProcessLayer.h | 12 +- .../NEON/functions/NEElementwiseOperations.h | 71 ++ .../NEON/functions/NEElementwiseUnaryLayer.h | 10 + arm_compute/runtime/NEON/functions/NEGEMMConv2d.h | 12 + .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 10 + arm_compute/runtime/NEON/functions/NELogical.h | 26 +- .../runtime/NEON/functions/NESoftmaxLayer.h | 11 + arm_compute/runtime/OperatorList.h | 53 +- docs/09_operators_list.dox | 825 +++++++++++++++++++-- 24 files changed, 1232 insertions(+), 120 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h index 8cc3e96ec5..3f984900ee 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,14 @@ class CLComparison : public ICLSimpleFunction { public: /** Initialise the kernel's inputs and outputs. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------|:--------|:--------| + * |All |All |U8 | * * @param[in] input1 Source tensor. Data types supported: All. * The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. diff --git a/arm_compute/runtime/CL/functions/CLCrop.h b/arm_compute/runtime/CL/functions/CLCrop.h index dc509b5b84..d2b72a5eff 100644 --- a/arm_compute/runtime/CL/functions/CLCrop.h +++ b/arm_compute/runtime/CL/functions/CLCrop.h @@ -55,6 +55,14 @@ public: * * @note Supported tensor rank: up to 4 * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |F32 | + * * @param[in] input Source tensor. Data type supported: All. Data layouts supported: NHWC. * @param[out] output Destination tensor. Data type supported: F32 * @param[in] start Coordinates of where to start cropping the image. 
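As a usage illustration of the All/All -> U8 contract documented for CLComparison above, here is a minimal sketch; the shapes, fill values and the choice of ComparisonOperation::Greater are illustrative and not part of this patch:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        // Create the default OpenCL context and queue used by the CL functions.
        CLScheduler::get().default_init();

        // Inputs may be any data type; the comparison output is always U8.
        CLTensor lhs, rhs, result;
        lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        rhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        result.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::U8));

        CLComparison greater;
        greater.configure(&lhs, &rhs, &result, ComparisonOperation::Greater);

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        result.allocator()->allocate();
        // ... map the tensors and fill lhs/rhs here ...

        greater.run();
        CLScheduler::get().sync(); // wait for the OpenCL queue to finish
        return 0;
    }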
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h index 6c1302fbf7..344ebd0afb 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h @@ -60,6 +60,15 @@ public: ~CLDeconvolutionLayerUpsample(); /** Initialize the function's source, destination, interpolation type and border_mode. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | * * @param[in, out] input Source tensor. Data type supported: All. * @param[out] output Destination tensor. Data type supported: same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h index a23500e16b..567de13508 100644 --- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h @@ -86,6 +86,20 @@ public: /** Default move assignment operator */ CLDirectDeconvolutionLayer &operator=(CLDirectDeconvolutionLayer &&) = default; /** Set the input, weights, biases and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h index fd6942cad5..79b79e89de 100644 --- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h +++ b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h @@ -53,6 +53,15 @@ public: /** Default move assignment operator */ CLRsqrtLayer &operator=(CLRsqrtLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. @@ -99,6 +108,15 @@ public: /** Default move assignment operator */ CLExpLayer &operator=(CLExpLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. @@ -145,6 +163,15 @@ public: /** Default move assignment operator */ CLNegLayer &operator=(CLNegLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. 
@@ -191,6 +218,15 @@ public: /** Default move assignment operator */ CLSinLayer &operator=(CLSinLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. @@ -237,6 +273,15 @@ public: /** Default move assignment operator */ CLLogLayer &operator=(CLLogLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. @@ -283,6 +328,15 @@ public: /** Default move assignment operator */ CLAbsLayer &operator=(CLAbsLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. @@ -329,6 +383,15 @@ public: /** Default move assignment operator */ CLRoundLayer &operator=(CLRoundLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Input tensor. Data types supported: F16/F32. * @param[out] output Output tensor. Data types supported: same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 2b291517f3..555e84a251 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -55,19 +55,23 @@ public: CLArithmeticAddition &operator=(CLArithmeticAddition &&); /** Initialise the kernel's inputs, output and conversion policy. * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |U8 |U8 |U8 | + * |U8 |U8 |S16 | + * |U8 |S16 |S16 | + * |S16 |U8 |S16 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -161,19 +165,23 @@ public: CLArithmeticSubtraction &operator=(CLArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. 
* - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |U8 |U8 |U8 | + * |U8 |U8 |S16 | + * |U8 |S16 |S16 | + * |S16 |U8 |S16 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -266,6 +274,15 @@ public: /** Default move assignment operator */ CLArithmeticDivision &operator=(CLArithmeticDivision &&); /** Initialise the kernel's inputs, output. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -326,6 +343,22 @@ public: /** Default move assignment operator */ CLElementwiseMax &operator=(CLElementwiseMax &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |U8 |U8 |U8 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |U32 |U32 |U32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -386,6 +419,22 @@ public: /** Default move assignment operator */ CLElementwiseMin &operator=(CLElementwiseMin &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |U8 |U8 |U8 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |U32 |U32 |U32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -446,6 +495,20 @@ public: /** Default move assignment operator */ CLElementwiseSquaredDiff &operator=(CLElementwiseSquaredDiff &&); /** Initialise the kernel's inputs, output and conversion policy. 
+ * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |U8 |U8 |U8 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. @@ -506,6 +569,15 @@ public: /** Default move assignment operator */ CLElementwisePower &operator=(CLElementwisePower &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h index 32af0f9427..6e482c98e7 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -92,6 +92,17 @@ public: /** Default desctructor */ ~CLGEMMDeconvolutionLayer(); /** Set the input, weights, biases and output tensors. + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index 0f051ecffd..a60992a0f4 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -287,11 +287,21 @@ class CLGEMMLowpOutputStage : public ICLSimpleFunction { public: /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | * * @param[in] input Input tensor. Data type supported: S32 * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16 * @param[in] info GEMMLowp output stage metadata. 
*/ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h index f7038ee97a..61a15816eb 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalAnd.h +++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h @@ -86,6 +86,14 @@ public: /** Default move assignment operator */ CLLogicalAnd &operator=(CLLogicalAnd &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | * * @param[in] input1 Input tensor. Data types supported: U8. * @param[in] input2 Input tensor. Data types supported: same as @p input1. diff --git a/arm_compute/runtime/CL/functions/CLLogicalNot.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h index 772f16b942..27fd0f9c9f 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalNot.h +++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h @@ -57,6 +57,14 @@ public: /** Default move assignment operator */ CLLogicalNot &operator=(CLLogicalNot &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:------------| + * |U8 |U8 | * * @param[in] input Input tensor. Data types supported: U8. * @param[out] output Output tensor. Data types supported: same as @p input. diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h index 948baee9d9..b9ffb4a449 100644 --- a/arm_compute/runtime/CL/functions/CLLogicalOr.h +++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h @@ -86,6 +86,14 @@ public: /** Default move assignment operator */ CLLogicalOr &operator=(CLLogicalOr &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | * * @param[in] input1 Input tensor. Data types supported: U8. * @param[in] input2 Input tensor. Data types supported: same as @p input1. diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h index ddb35ae56f..721a47144e 100644 --- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h @@ -59,6 +59,17 @@ public: /** Default destructor */ ~CLSoftmaxLayerGeneric(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax * @param[out] output Destination tensor. Data types supported: same as @p input diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h index 8cd809cc1f..d644591b57 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h +++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,6 +40,16 @@ class CLWinogradInputTransform : public ICLSimpleFunction { public: /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | * * @note Winograd input transform supports the following configurations for NCWH data layout * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index 734e3502dd..b8e46ff36e 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -52,8 +52,7 @@ public: /** Initialise the kernel's inputs, output and conversion policy. * * Valid data layouts: - * - NHWC - * - NCHW + * - All * * Valid data type configurations: * |src0 |src1 |dst | diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index c741db3223..0c72e946f6 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -56,6 +56,25 @@ public: /** Default move assignment operator */ NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |QSYMM16 |QSYMM16 |S32 | + * |U8 |U8 |U8 | + * |U8 |U8 |S16 | + * |U8 |S16 |S16 | + * |S16 |U8 |S16 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index d5c1f0ab6f..2affa8d49e 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,16 @@ public: /** Default destructor */ ~NEDetectionPostProcessLayer() = default; /** Configure the detection output layer NE function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src2 |dst0 - dst3 | + * |:--------------|:--------------| + * |QASYMM8 |F32 | + * |QASYMM8_SIGNED |F32 | + * |F32 |F32 | * * @param[in] input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. * @param[in] input_score The class prediction input tensor. Data types supported: same as @p input_box_encoding. 
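To make the QASYMM8 row of the NEArithmeticSubtraction table above concrete, a minimal sketch follows. The quantisation parameters are illustrative, and SATURATE is used because, per the note in the header, WRAP is not valid for the quantized configurations:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // QASYMM8 - QASYMM8 -> QASYMM8, one of the configurations listed above.
        const QuantizationInfo qinfo(0.25f, 10); // scale/offset chosen for illustration
        Tensor a, b, out;
        a.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, qinfo));
        b.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, qinfo));
        out.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, qinfo));

        NEArithmeticSubtraction sub;
        sub.configure(&a, &b, &out, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        sub.run();
        return 0;
    }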
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 44b70bbe85..95274bdb0c 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -53,6 +53,19 @@ public: /** Default move assignment operator */ NEElementwiseMax &operator=(NEElementwiseMax &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. @@ -100,6 +113,19 @@ public: /** Default move assignment operator */ NEElementwiseMin &operator=(NEElementwiseMin &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. @@ -147,6 +173,19 @@ public: /** Default move assignment operator */ NEElementwiseSquaredDiff &operator=(NEElementwiseSquaredDiff &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. @@ -194,6 +233,15 @@ public: /** Default move assignment operator */ NEElementwiseDivision &operator=(NEElementwiseDivision &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. @@ -242,6 +290,15 @@ public: /** Default move assignment operator */ NEElementwisePower &operator=(NEElementwisePower &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. 
@@ -289,6 +346,20 @@ public: /** Default move assignment operator */ NEElementwiseComparison &operator=(NEElementwiseComparison &&); /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:-----| + * |QASYMM8 |QASYMM8 |U8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |U8 | + * |S32 |S32 |U8 | + * |U8 |U8 |U8 | + * |S16 |S16 |U8 | + * |F16 |F16 |U8 | + * |F32 |F32 |U8 | * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h index 4786f71cf8..63e47b8377 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h @@ -53,6 +53,16 @@ public: NEElementwiseUnaryLayer &operator=(NEElementwiseUnaryLayer &&); /** Initialize the function + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + * |S32 |S32 | * * @param[in] input Input tensor. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. * @param[out] output Output tensor. Data types supported: Same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index 8c3ba4f0c8..b2ffd038de 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -64,6 +64,18 @@ public: /** Destructor */ ~NEGEMMConv2d(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index c22ed1b5c4..fa5f5e3826 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -281,6 +281,16 @@ public: /** Default destructor */ ~NEGEMMLowpOutputStage(); /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | * * @param[in] input Input tensor. Data type supported: S32 * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. 
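A hedged sketch of driving the S32 -> QASYMM8 row of the NEGEMMLowpOutputStage table just documented; the requantization multiplier, shift and offset below are placeholders rather than values derived from a real model:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor acc, bias, dst;
        acc.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::S32));
        bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32)); // 1D bias [OFM]
        dst.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 3)));

        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = 1073741824; // placeholder fixed-point multiplier
        info.gemmlowp_shift      = 3;          // placeholder right-shift
        info.gemmlowp_offset     = 3;          // should match the output quantization offset
        info.gemmlowp_min_bound  = 0;
        info.gemmlowp_max_bound  = 255;
        info.output_data_type    = DataType::QASYMM8;

        NEGEMMLowpOutputStage stage;
        stage.configure(&acc, &bias, &dst, info);

        acc.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        stage.run();
        return 0;
    }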
diff --git a/arm_compute/runtime/NEON/functions/NELogical.h b/arm_compute/runtime/NEON/functions/NELogical.h index 04ffce6221..5cf5336f4f 100644 --- a/arm_compute/runtime/NEON/functions/NELogical.h +++ b/arm_compute/runtime/NEON/functions/NELogical.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,14 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalAnd) /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. @@ -82,6 +90,14 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalOr) /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. @@ -117,6 +133,14 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalNot) /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:-------------| + * |U8 |U8 | * * @param[in] input Input tensor. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index 8a2ae10129..efe959f14e 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -51,6 +51,17 @@ public: /** Default destructor */ ~NESoftmaxLayerGeneric(); /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | * * @param[in,out] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a * multiple of the internal processing block size, @ref NEFillBorder replicates the diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h index a659a79423..e1c2bed41d 100644 --- a/arm_compute/runtime/OperatorList.h +++ b/arm_compute/runtime/OperatorList.h @@ -51,7 +51,7 @@ * */ -/** ArithmeticAddition (no CL) +/** ArithmeticAddition * * Description: * Function to add 2 tensors. @@ -61,7 +61,7 @@ * */ -/** ArithmeticSubtraction (no CL) +/** ArithmeticSubtraction * * Description: * Function to substract 2 tensors. @@ -161,7 +161,7 @@ * */ -/** Comparison (only CL) +/** Comparison * * Description: * Function to compare 2 tensors. @@ -216,7 +216,7 @@ * */ -/** Crop (only CL) +/** Crop * * Description: * Performs a copy of input tensor to the output tensor. @@ -246,7 +246,7 @@ * */ -/** DeconvolutionLayerUpsample (only CL) +/** DeconvolutionLayerUpsample * * Description: * Function to execute deconvolution upsample on OpenCL. 
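The NELogical functions documented above all operate on U8 tensors interpreted as booleans; a minimal sketch (shape illustrative):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // U8 inputs and output, as in the LogicalAnd table: non-zero values act as "true".
        Tensor a, b, out;
        const TensorInfo info(TensorShape(32U), 1, DataType::U8);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        NELogicalAnd logical_and;
        logical_and.configure(&a, &b, &out);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        logical_and.run();
        return 0;
    }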
@@ -296,7 +296,7 @@ * */ -/** DetectionPostProcessLayer (no CL) +/** DetectionPostProcessLayer * * Description: * Function to generate the detection output based on center size encoded boxes, class prediction and anchors by doing non maximum suppression (NMS). @@ -316,7 +316,7 @@ * */ -/** DirectDeconvolutionLayer (only CL) +/** DirectDeconvolutionLayer * * Description: * Function to run the deconvolution layer. @@ -326,7 +326,7 @@ * */ -/** ElementWiseOperations (skip) +/** ElementWiseOperations * * Description: * Function to perform in Cpu: @@ -336,7 +336,6 @@ * - Pow * - SquaredDiff * - Comparisons (Equal, greater, greater_equal, less, less_equal, not_equal) - * * Function to perform in CL: * - Add * - Sub @@ -351,18 +350,18 @@ * ANEURALNETWORKS_MINIMUM * ANEURALNETWORKS_POW * ANEURALNETWORKS_DIV - * ANEURALNETWORKS_ADD (only CL) - * ANEURALNETWORKS_SUB (only CL) - * ANEURALNETWORKS_EQUAL (no CL) - * ANEURALNETWORKS_GREATER (no CL) - * ANEURALNETWORKS_GREATER_EQUAL (no CL) - * ANEURALNETWORKS_LESS (no CL) - * ANEURALNETWORKS_LESS_EQUAL (no CL) - * ANEURALNETWORKS_NOT_EQUAL (no CL) + * ANEURALNETWORKS_ADD + * ANEURALNETWORKS_SUB + * ANEURALNETWORKS_EQUAL + * ANEURALNETWORKS_GREATER + * ANEURALNETWORKS_GREATER_EQUAL + * ANEURALNETWORKS_LESS + * ANEURALNETWORKS_LESS_EQUAL + * ANEURALNETWORKS_NOT_EQUAL * */ -/** ElementWiseOperationUnary (skip) +/** ElementwiseUnaryLayer * * Description: * Function to perform: @@ -494,7 +493,7 @@ * */ -/** GEMMConv2D (no CL) +/** GEMMConv2D * * Description: * General Matrix Multiplication. @@ -514,7 +513,7 @@ * */ -/** GEMMDeconvolutionLayer (only CL) +/** GEMMDeconvolutionLayer * * Description: * General Matrix Multiplication. @@ -574,7 +573,7 @@ * */ -/** Logical (no CL) +/** Logical * * Description: * Function to perform: @@ -587,7 +586,7 @@ * */ -/** LogicalAnd (only CL) +/** LogicalAnd * * Description: * Function to perform Logical AND. @@ -597,7 +596,7 @@ * */ -/** LogicalOr (only CL) +/** LogicalOr * * Description: * Function to perform Logical OR. @@ -607,7 +606,7 @@ * */ -/** LogicalNot (only CL) +/** LogicalNot * * Description: * Function to perform Logical NOT. @@ -724,7 +723,7 @@ /** PriorBoxLayer * * Description: - * Function to . + * Function to compute prior boxes and clip. * * Equivalent Android NNAPI Op: * n/a @@ -889,7 +888,7 @@ * */ -/** SoftmaxLayer (skip) +/** SoftmaxLayer * * Description: * Function to compute a SoftmaxLayer and a Log SoftmaxLayer. @@ -990,7 +989,7 @@ * */ -/** WinogradInputTransform (only CL) +/** WinogradInputTransform * * Description: * Function to. diff --git a/docs/09_operators_list.dox b/docs/09_operators_list.dox index 244f292f82..fc41265738 100644 --- a/docs/09_operators_list.dox +++ b/docs/09_operators_list.dox @@ -144,6 +144,62 @@ where N = batches, C = channels, H = height, W = width F16U32, S32 F32U32, S32 + + ArithmeticAddition + Function to add 2 tensors. + +
    +
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_ADD
    NEArithmeticAddition
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |QSYMM16        |QSYMM16        |S32            |
        |U8             |U8             |U8             |
        |U8             |U8             |S16            |
        |U8             |S16            |S16            |
        |S16            |U8             |S16            |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
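One of the mixed-type rows above (U8 + U8 -> S16) as a sketch; widening the destination means the sum cannot exceed the S16 range, so either convert policy is safe here:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, sum;
        a.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::U8));
        b.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::U8));
        sum.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S16)); // widened output

        NEArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::WRAP);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();

        add.run();
        return 0;
    }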
ArithmeticSubtraction
    Function to subtract 2 tensors.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_SUB
    NEArithmeticSubtraction
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |QSYMM16        |QSYMM16        |S32            |
        |U8             |U8             |U8             |
        |U8             |U8             |S16            |
        |U8             |S16            |S16            |
        |S16            |U8             |S16            |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
BatchNormalizationLayer
    Function to perform batch normalization.
@@ -421,6 +477,28 @@ where N = batches, C = channels, H = height, W = width
        |src |dst |
        |All |All |
Comparison
    Function to compare 2 tensors.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_EQUAL
      • ANEURALNETWORKS_GREATER
      • ANEURALNETWORKS_GREATER_EQUAL
      • ANEURALNETWORKS_LESS
      • ANEURALNETWORKS_LESS_EQUAL
      • ANEURALNETWORKS_NOT_EQUAL
    CLComparison
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |All  |All  |U8  |
ConcatenateLayer
    Function to concatenate tensors along a given axis.
@@ -553,6 +631,23 @@ where N = batches, C = channels, H = height, W = width
        |src |dst |
        |All |All |
Crop
    Performs a copy of input tensor to the output tensor.
    Equivalent Android NNAPI Op:
      • n/a
    CLCrop
      Valid data layouts:
        • NHWC
      Valid data type configurations:
        |src |dst |
        |All |F32 |
CropResize
    Function to perform cropping and resizing.
@@ -621,6 +716,24 @@ where N = batches, C = channels, H = height, W = width
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32 |QASYMM8_SIGNED |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
DeconvolutionLayerUpsample
    Function to execute deconvolution upsample on OpenCL.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_TRANSPOSE_CONV_2D
    CLDeconvolutionLayerUpsample
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src |dst |
        |All |All |
DepthConvertLayer
    Performs a down-scaling depth conversion.
@@ -706,39 +819,420 @@ where N = batches, C = channels, H = height, W = width
    NEDepthwiseConvolutionLayer
      Valid data type configurations:
        |src0           |src1               |src2 |dst            |
        |F16            |F16                |F16  |F16            |
        |F32            |F32                |F32  |F32            |
        |QASYMM8        |QASYMM8            |S32  |QASYMM8        |
        |QASYMM8        |QSYMM8_PER_CHANNEL |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32  |QASYMM8_SIGNED |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32  |QASYMM8_SIGNED |
    CLDepthwiseConvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1               |src2 |dst            |
        |F16            |F16                |F16  |F16            |
        |F32            |F32                |F32  |F32            |
        |QASYMM8        |QASYMM8            |S32  |QASYMM8        |
        |QASYMM8        |QSYMM8_PER_CHANNEL |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32  |QASYMM8_SIGNED |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32  |QASYMM8_SIGNED |
DequantizationLayer
    Function to dequantize the values in a tensor.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_DEQUANTIZE
    NEDequantizationLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src                |dst      |
        |QASYMM8            |F16, F32 |
        |QASYMM8_SIGNED     |F16, F32 |
        |QSYMM8_PER_CHANNEL |F16, F32 |
        |QSYMM8             |F16, F32 |
        |QSYMM16            |F16, F32 |
    CLDequantizationLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src                |dst      |
        |QASYMM8            |F16, F32 |
        |QASYMM8_SIGNED     |F16, F32 |
        |QSYMM8_PER_CHANNEL |F16, F32 |
        |QSYMM8             |F16, F32 |
        |QSYMM16            |F16, F32 |
DetectionPostProcessLayer
    Function to generate the detection output based on center size encoded boxes, class prediction and anchors by doing non maximum suppression (NMS).
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_DETECTION_POSTPROCESSING
    NEDetectionPostProcessLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 - src2    |dst0 - dst3 |
        |QASYMM8        |F32         |
        |QASYMM8_SIGNED |F32         |
        |F32            |F32         |
DirectConvolutionLayer
    Function to compute direct convolution.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_CONV_2D
    NEDirectConvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0 |src1 |src2 |dst |
        |F16  |F16  |F16  |F16 |
        |F32  |F32  |F32  |F32 |
    CLDirectConvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1           |src2 |dst            |
        |F16            |F16            |F16  |F16            |
        |F32            |F32            |F32  |F32            |
        |QASYMM8        |QASYMM8        |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |S32  |QASYMM8_SIGNED |
DirectDeconvolutionLayer
    Function to run the deconvolution layer.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_TRANSPOSE_CONV_2D
    CLDirectDeconvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1               |src2 |dst            |
        |F16            |F16                |F16  |F16            |
        |F32            |F32                |F32  |F32            |
        |QASYMM8        |QASYMM8            |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32  |QASYMM8_SIGNED |
        |QASYMM8        |QSYMM8_PER_CHANNEL |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32  |QASYMM8_SIGNED |
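A sketch of the F32 row of the CLDirectDeconvolutionLayer table above. The shapes assume NCHW, a 3x3 kernel, stride 1 and no padding, so the output side is (8 - 1) * 1 + 3 = 10 under the usual deconvolution output formula; all values are illustrative:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));         // W x H x IFM
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32)); // kW x kH x IFM x OFM
        bias.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(10U, 10U, 4U), 1, DataType::F32));

        CLDirectDeconvolutionLayer deconv;
        deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        deconv.run();
        CLScheduler::get().sync();
        return 0;
    }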
ElementWiseOperations
    Function to perform elementwise operations. On the CPU backend: Div, Max, Min, Pow, SquaredDiff and comparisons (Equal, Greater, GreaterEqual, Less, LessEqual, NotEqual). On the OpenCL backend: Add, Sub, Div, Max, Min, Pow and SquaredDiff.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_MAXIMUM
      • ANEURALNETWORKS_MINIMUM
      • ANEURALNETWORKS_POW
      • ANEURALNETWORKS_DIV
      • ANEURALNETWORKS_ADD
      • ANEURALNETWORKS_SUB
      • ANEURALNETWORKS_EQUAL
      • ANEURALNETWORKS_GREATER
      • ANEURALNETWORKS_GREATER_EQUAL
      • ANEURALNETWORKS_LESS
      • ANEURALNETWORKS_LESS_EQUAL
      • ANEURALNETWORKS_NOT_EQUAL
    NEElementwiseMax
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |S32            |S32            |S32            |
        |S16            |S16            |S16            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    NEElementwiseMin
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |S32            |S32            |S32            |
        |S16            |S16            |S16            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    NEElementwiseSquaredDiff
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |S32            |S32            |S32            |
        |S16            |S16            |S16            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    NEElementwiseDivision
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |F16  |F16  |F16 |
        |F32  |F32  |F32 |
    NEElementwisePower
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |F16  |F16  |F16 |
        |F32  |F32  |F32 |
    NEElementwiseComparison
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst |
        |QASYMM8        |QASYMM8        |U8  |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |U8  |
        |S32            |S32            |U8  |
        |U8             |U8             |U8  |
        |S16            |S16            |U8  |
        |F16            |F16            |U8  |
        |F32            |F32            |U8  |
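A sketch of NEElementwiseMax on the F32 configuration. The second input's shape relies on broadcasting of dimension 0, which the elementwise configure() documentation describes for the CL variants and is assumed to hold on the CPU side as well:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, out;
        a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // broadcast along dim 0
        out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        NEElementwiseMax emax;
        emax.configure(&a, &b, &out);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        emax.run();
        return 0;
    }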
    CLArithmeticAddition
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |U8             |U8             |U8             |
        |U8             |U8             |S16            |
        |U8             |S16            |S16            |
        |S16            |U8             |S16            |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    CLArithmeticSubtraction
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |U8             |U8             |U8             |
        |U8             |U8             |S16            |
        |U8             |S16            |S16            |
        |S16            |U8             |S16            |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    CLArithmeticDivision
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |F16  |F16  |F16 |
        |F32  |F32  |F32 |
    CLElementwiseMax
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |U8             |U8             |U8             |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |U32            |U32            |U32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    CLElementwiseMin
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |U8             |U8             |U8             |
        |S16            |S16            |S16            |
        |S32            |S32            |S32            |
        |U32            |U32            |U32            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    CLElementwiseSquaredDiff
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |dst            |
        |QASYMM8        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |QSYMM16        |QSYMM16        |QSYMM16        |
        |U8             |U8             |U8             |
        |S16            |S16            |S16            |
        |F16            |F16            |F16            |
        |F32            |F32            |F32            |
    CLElementwisePower
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |F16  |F16  |F16 |
        |F32  |F32  |F32 |
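The CL elementwise functions also accept an optional fused ActivationLayerInfo; a sketch of CLArithmeticDivision on F32 with a fused ReLU (the fusion choice and shapes are illustrative):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor num, den, quot;
        num.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        den.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        quot.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        CLArithmeticDivision div;
        // The trailing ActivationLayerInfo is optional; here a ReLU is fused into the kernel.
        div.configure(&num, &den, &quot, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        num.allocator()->allocate();
        den.allocator()->allocate();
        quot.allocator()->allocate();

        div.run();
        CLScheduler::get().sync();
        return 0;
    }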
ElementwiseUnaryLayer
    Function to perform: Rsqrt, Exp, Neg, Log, Abs, Round, Sin.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_ABS
      • ANEURALNETWORKS_EXP
      • ANEURALNETWORKS_LOG
      • ANEURALNETWORKS_NEG
      • ANEURALNETWORKS_RSQRT
      • ANEURALNETWORKS_SIN
    NEElementwiseUnaryLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
        |S32 |S32 |
    CLRsqrtLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLExpLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLNegLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLSinLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLLogLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLAbsLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
    CLRoundLayer
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
(This hunk also removes the stale CLDepthwiseConvolutionLayer, DequantizationLayer and DirectConvolutionLayer rows that previously sat at this position; those operators are now listed in alphabetical order above.)
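Each of the CL unary layers above shares the same single-input configure(); a sketch with CLExpLayer on F32:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32)); // same type as the input

        CLExpLayer exp_layer;
        exp_layer.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        exp_layer.run();
        CLScheduler::get().sync();
        return 0;
    }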
FFT1D
@@ -1009,7 +1499,7 @@ where N = batches, C = channels, H = height, W = width
      • ANEURALNETWORKS_FULLY_CONNECTED
-    NEFullyConnectedLayerReshapeWeightsManaged
+    NEFullyConnectedLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1           |src2 |dst            |
        |F16            |F16            |F16  |F16            |
        |F32            |F32            |F32  |F32            |
        |QASYMM8        |QASYMM8        |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |S32  |QASYMM8_SIGNED |
-    CLFullyConnectedLayerReshapeWeightsManaged
+    CLFullyConnectedLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1           |src2 |dst            |
        |F16            |F16            |F16  |F16            |
        |F32            |F32            |F32  |F32            |
        |QASYMM8        |QASYMM8        |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |S32  |QASYMM8_SIGNED |
@@ -1118,7 +1608,7 @@ where N = batches, C = channels, H = height, W = width
        |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
-    CLGEMMReshapeRHSMatrixKernelManaged
+    CLGEMM
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |src2 |dst |
        |F32  |F32  |F32  |F32 |
        |F16  |F16  |F16  |F16 |
GEMMConv2D
    General Matrix Multiplication.
              +
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_CONV_2D
    NEGEMMConv2d
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0           |src1           |src2     |dst            |
        |QASYMM8        |QASYMM8        |S32      |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |S32      |QASYMM8_SIGNED |
        |F16            |F16            |F16      |F16            |
        |F32            |F32            |F32      |F32            |
        |BFLOAT16       |BFLOAT16       |BFLOAT16 |BFLOAT16       |
GEMMConvolutionLayer
    General Matrix Multiplication.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_CONV_2D
-    NEConvolutionLayerReshapeWeights
+    NEGEMMConvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1               |src2     |dst            |
        |F16            |F16                |F16      |F16            |
        |F32            |F32                |F32      |F32            |
        |BFLOAT16       |BFLOAT16           |BFLOAT16 |BFLOAT16       |
        |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
        |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
-    CLConvolutionLayerReshapeWeights
+    CLGEMMConvolutionLayer
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src0           |src1               |src2 |dst            |
        |F16            |F16                |F16  |F16            |
        |F32            |F32                |F32  |F32            |
        |QASYMM8        |QASYMM8            |S32  |QASYMM8        |
        |QASYMM8        |QSYMM8_PER_CHANNEL |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32  |QASYMM8_SIGNED |
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32  |QASYMM8_SIGNED |
GEMMDeconvolutionLayer
    General Matrix Multiplication.
    Equivalent Android NNAPI Op:
      • ANEURALNETWORKS_TRANSPOSE_CONV_2D
    CLGEMMDeconvolutionLayer
      Valid data layouts:
        • NHWC
      Valid data type configurations:
        |src0           |src1           |src2 |dst            |
        |F16            |F16            |F16  |F16            |
        |F32            |F32            |F32  |F32            |
        |QASYMM8        |QASYMM8        |S32  |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |S32  |QASYMM8_SIGNED |
GEMMLowpMatrixMultiplyCore
    General Matrix Multiplication.
@@ -1222,6 +1753,38 @@ where N = batches, C = channels, H = height, W = width
        |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
        |QASYMM8_SIGNED |QSYMM8             |S32 |S32 |
GEMMLowpOutputStage
    General Matrix Multiplication.
    Equivalent Android NNAPI Op:
                  +
      • n/a
    NEGEMMLowpOutputStage
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst            |
        |S32  |S32  |QASYMM8        |
        |S32  |S32  |QASYMM8_SIGNED |
        |S32  |S32  |QSYMM16        |
    CLGEMMLowpOutputStage
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst            |
        |S32  |S32  |QASYMM8        |
        |S32  |S32  |QASYMM8_SIGNED |
        |S32  |S32  |QSYMM16        |
GenerateProposalsLayer
    Function to generate proposals for a RPN (Region Proposal Network).
@@ -1318,6 +1881,96 @@ where N = batches, C = channels, H = height, W = width
        |F16 |F16 |
        |F32 |F32 |
Logical
    Function to perform: Logical AND, Logical OR, Logical NOT.
    Equivalent Android NNAPI Op:
                  +
      • n/a
    NELogicalAnd
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |U8   |U8   |U8  |
    NELogicalOr
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |U8   |U8   |U8  |
    NELogicalNot
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |U8  |U8  |
LogicalAnd
    Function to perform Logical AND.
    Equivalent Android NNAPI Op:
      • n/a
    CLLogicalAnd
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |U8   |U8   |U8  |
LogicalOr
    Function to perform Logical OR.
    Equivalent Android NNAPI Op:
      • n/a
    CLLogicalOr
      Valid data layouts:
        • All
      Valid data type configurations:
        |src0 |src1 |dst |
        |U8   |U8   |U8  |
LogicalNot
    Function to perform Logical NOT.
    Equivalent Android NNAPI Op:
      • n/a
    CLLogicalNot
      Valid data layouts:
        • All
      Valid data type configurations:
        |src |dst |
        |U8  |U8  |
LSTMLayer
    Function to perform a single time step in a Long Short-Term Memory (LSTM) layer.
@@ -1660,7 +2313,7 @@ where N = batches, C = channels, H = height, W = width
PriorBoxLayer
-    Function to .
+    Function to compute prior boxes and clip.
    Equivalent Android NNAPI Op:
      • n/a
@@ -2150,6 +2803,41 @@ where N = batches, C = channels, H = height, W = width
        |src |dst |
        |All |All |
SoftmaxLayer
    Function to compute a SoftmaxLayer and a Log SoftmaxLayer.
    Equivalent Android NNAPI Op:
                    +
      • ANEURALNETWORKS_LOG_SOFTMAX
      • ANEURALNETWORKS_SOFTMAX
    NESoftmaxLayerGeneric
      Valid data layouts:
        • All
      Valid data type configurations:
        |src            |dst            |
        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |F16            |F16            |
        |F32            |F32            |
    CLSoftmaxLayerGeneric
      Valid data layouts:
        • All
      Valid data type configurations:
        |src            |dst            |
        |QASYMM8        |QASYMM8        |
        |QASYMM8_SIGNED |QASYMM8_SIGNED |
        |F16            |F16            |
        |F32            |F32            |
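A sketch of the QASYMM8 softmax row above; the input quantisation is illustrative, while the output quantisation of (1/256, 0) is what the quantized softmax conventionally produces:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor logits, probs;
        logits.allocator()->init(TensorInfo(TensorShape(10U, 2U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 5)));
        probs.allocator()->init(TensorInfo(TensorShape(10U, 2U), 1, DataType::QASYMM8, QuantizationInfo(1.0f / 256, 0)));

        NESoftmaxLayer softmax;
        softmax.configure(&logits, &probs); // beta = 1.f, axis = 0 by default

        logits.allocator()->allocate();
        probs.allocator()->allocate();

        softmax.run();
        return 0;
    }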
SpaceToBatchLayer
    Function to divide a tensor spatially.
@@ -2410,6 +3098,25 @@ where N = batches, C = channels, H = height, W = width
        |F16 |F16 |F16 |F16 |
        |F32 |F32 |F32 |F32 |
WinogradInputTransform
    Function to perform the Winograd input transform.
    Equivalent Android NNAPI Op:
                    +
      • n/a
    CLWinogradInputTransform
      Valid data layouts:
        • NHWC
        • NCHW
      Valid data type configurations:
        |src |dst |
        |F16 |F16 |
        |F32 |F32 |
*/
--
cgit v1.2.1