author     Jakub Sujak <jakub.sujak@arm.com>             2021-06-04 09:46:08 +0100
committer  Michele Di Giorgio <michele.digiorgio@arm.com>  2021-06-11 09:19:27 +0000
commit     ee301b384f4aeb697a5c249b8bb848d784146582 (patch)
tree       e42ecfcfdbf95d21d5d01a422663161d32fe1733
parent     a5c428a5428d1c7a9d1d03fd198d6a8578b6c12c (diff)
download   ComputeLibrary-ee301b384f4aeb697a5c249b8bb848d784146582.tar.gz
Fix errata in documentation
This patch addresses the following errata found in the project documentation:

* Common typos.
* Missing use of trademarks.
* Incomplete operator descriptions.
* Examples of code that have since been removed from the library.
* Plus clarification over the usage of `All` category for data types and layouts.

In addition, the Operator list was not generated properly due to:

* Non-matching cases in the filenames (i.e. `Elementwise` and `ElementWise`). For consistency, all usages of the latter have been renamed to the former.
* Extra data layout tables in the headers for the `NESlice` and `NEStridedSlice` functions (note: not present in CL counterpart) meant documentation for those functions was generated twice.

Resolves: COMPMID-4561, COMPMID-4562, COMPMID-4563

Change-Id: I1eb24559545397749e636ffbf927727fb1bc6201
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5769
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
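For downstream code, the practical effect of the `ElementWise` -> `Elementwise` renames is that includes of the old casing no longer resolve on case-sensitive filesystems. A minimal sketch of the required update (the surrounding usage is hypothetical; only the header paths are taken from this patch):

    // Old casing, removed by this patch:
    // #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

    // New casing introduced by this patch:
    #include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"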
-rw-r--r--  Android.bp                                                                                                      2
-rw-r--r--  arm_compute/runtime/CL/CLFunctions.h                                                                            2
-rw-r--r--  arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h (renamed from arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h)  0
-rw-r--r--  arm_compute/runtime/NEON/functions/NESlice.h                                                                    8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEStridedSlice.h                                                             8
-rw-r--r--  arm_compute/runtime/OperatorList.h                                                                             28
-rw-r--r--  docs/contributor_guide/adding_operator.dox                                                                     16
-rw-r--r--  docs/contributor_guide/contribution_guidelines.dox                                                              6
-rw-r--r--  docs/user_guide/data_layout.dox                                                                                 2
-rw-r--r--  docs/user_guide/how_to_build_and_run_examples.dox                                                              56
-rw-r--r--  docs/user_guide/library.dox                                                                                    43
-rw-r--r--  docs/user_guide/operator_list.dox                                                                              36
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox                                                             22
-rw-r--r--  src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp (renamed from src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp)  2
-rw-r--r--  tests/validation/CL/AbsLayer.cpp                                                                                4
-rw-r--r--  tests/validation/CL/ExpLayer.cpp                                                                                4
-rw-r--r--  tests/validation/CL/LogLayer.cpp                                                                                4
-rw-r--r--  tests/validation/CL/NegLayer.cpp                                                                                4
-rw-r--r--  tests/validation/CL/RoundLayer.cpp                                                                              4
-rw-r--r--  tests/validation/CL/RsqrtLayer.cpp                                                                              4
-rw-r--r--  tests/validation/CL/SinLayer.cpp                                                                                4
-rw-r--r--  tests/validation/NEON/ElementwiseAbsoluteValue.cpp                                                              2
-rw-r--r--  tests/validation/NEON/ElementwiseExpLayer.cpp                                                                   2
-rw-r--r--  tests/validation/NEON/ElementwiseLog.cpp                                                                        2
-rw-r--r--  tests/validation/NEON/ElementwiseNegation.cpp                                                                   2
-rw-r--r--  tests/validation/NEON/ElementwiseRound.cpp                                                                      2
-rw-r--r--  tests/validation/NEON/ElementwiseRsqrtLayer.cpp                                                                 2
-rw-r--r--  tests/validation/NEON/ElementwiseSin.cpp                                                                        2
-rw-r--r--  tests/validation/fixtures/ElementwiseUnaryFixture.h (renamed from tests/validation/fixtures/ElementWiseUnaryFixture.h)  2
-rw-r--r--  tests/validation/reference/ElementwiseUnary.cpp (renamed from tests/validation/reference/ElementWiseUnary.cpp)  2
-rw-r--r--  tests/validation/reference/ElementwiseUnary.h (renamed from tests/validation/reference/ElementWiseUnary.h)      0
31 files changed, 137 insertions, 140 deletions
diff --git a/Android.bp b/Android.bp
index db4a3aa843..bafbe8fe89 100644
--- a/Android.bp
+++ b/Android.bp
@@ -447,8 +447,8 @@ cc_library_static {
"src/runtime/CL/functions/CLDequantizationLayer.cpp",
"src/runtime/CL/functions/CLDirectConvolutionLayer.cpp",
"src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp",
- "src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp",
"src/runtime/CL/functions/CLElementwiseOperations.cpp",
+ "src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp",
"src/runtime/CL/functions/CLFFT1D.cpp",
"src/runtime/CL/functions/CLFFT2D.cpp",
"src/runtime/CL/functions/CLFFTConvolutionLayer.cpp",
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 01b61c82d8..9d4b2fa050 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -51,8 +51,8 @@
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
index 594ee4cfdc..594ee4cfdc 100644
--- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index 550bfd2188..ac79a5c633 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -103,14 +103,6 @@ class NESlice : public INEOperator
public:
/** Configure kernel
*
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:------|:------|
- * |All |All |
- *
* @note Supported tensor rank: up to 4
* @note Start indices must be non-negative. 0 <= starts[i]
* @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 0b4c2a63a1..4b14d946f6 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -109,14 +109,6 @@ class NEStridedSlice : public INEOperator
public:
/** Configure kernel
*
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:------|:------|
- * |All |All |
- *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor info. Data type supported: All
diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h
index e1c2bed41d..8f1f4ba0a9 100644
--- a/arm_compute/runtime/OperatorList.h
+++ b/arm_compute/runtime/OperatorList.h
@@ -94,7 +94,7 @@
/** BitwiseAnd
*
* Description:
- * Function to performe bitwise AND between 2 tensors.
+ * Function to perform bitwise AND between 2 tensors.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_LOGICAL_AND
@@ -104,7 +104,7 @@
/** BitwiseNot
*
* Description:
- * Function to performe bitwise NOT.
+ * Function to perform bitwise NOT.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_LOGICAL_NOT
@@ -114,7 +114,7 @@
/** BitwiseOr
*
* Description:
- * Function to performe bitwise OR between 2 tensors.
+ * Function to perform bitwise OR between 2 tensors.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_LOGICAL_OR
@@ -124,7 +124,7 @@
/** BitwiseXor
*
* Description:
- * Function to performe bitwise XOR between 2 tensors.
+ * Function to perform bitwise XOR between 2 tensors.
*
* Equivalent Android NNAPI Op:
* n/a
@@ -189,7 +189,7 @@
/** ConvertFullyConnectedWeights
*
* Description:
- * Function to tranpose the wieghts for the fully connected layer.
+ * Function to transpose the weights for the fully connected layer.
*
* Equivalent Android NNAPI Op:
* n/a
@@ -239,7 +239,7 @@
/** DeconvolutionLayer
*
* Description:
- * Function to compute a deconvolution or tranpose convolution.
+ * Function to compute a deconvolution or transpose convolution.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_TRANSPOSE_CONV_2D
@@ -326,7 +326,7 @@
*
*/
-/** ElementWiseOperations
+/** ElementwiseOperations
*
* Description:
* Function to perform in Cpu:
@@ -426,7 +426,7 @@
/** FillBorder
*
* Description:
- * Function to .
+ * Function to fill the borders within the XY-planes.
*
* Equivalent Android NNAPI Op:
* n/a
@@ -493,7 +493,7 @@
*
*/
-/** GEMMConv2D
+/** GEMMConv2d
*
* Description:
* General Matrix Multiplication.
@@ -691,7 +691,7 @@
/** PixelWiseMultiplication
*
* Description:
- * Function to performe a multiplication.
+ * Function to perform a multiplication.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_MUL
@@ -701,7 +701,7 @@
/** PoolingLayer
*
* Description:
- * Function to performe pooling with the specified pooling operation.
+ * Function to perform pooling with the specified pooling operation.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_AVERAGE_POOL_2D
@@ -764,7 +764,7 @@
/** ReduceMean
*
* Description:
- * Function to performe reduce mean operation.
+ * Function to perform reduce mean operation.
*
* Equivalent Android NNAPI Op:
* ANEURALNETWORKS_MEAN
@@ -774,7 +774,7 @@
/** ReductionOperation
*
* Description:
- * Function to performe reduce with the following operations
+ * Function to perform reduce with the following operations
* - ARG_IDX_MAX: Index of the max value
* - ARG_IDX_MIN: Index of the min value
* - MEAN_SUM: Mean of sum
@@ -992,7 +992,7 @@
/** WinogradInputTransform
*
* Description:
- * Function to.
+ * Function to perform a Winograd transform on the input tensor.
*
* Equivalent Android NNAPI Op:
* n/a
diff --git a/docs/contributor_guide/adding_operator.dox b/docs/contributor_guide/adding_operator.dox
index 67e6fbd25b..772d4362c8 100644
--- a/docs/contributor_guide/adding_operator.dox
+++ b/docs/contributor_guide/adding_operator.dox
@@ -75,10 +75,10 @@ Similarly, all common functions that process shapes, like calculating output sha
@subsection S4_1_2_add_kernel Add a kernel
As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like Arm® Neon™ or OpenCL. All kernels in Compute Library must implement a common interface IKernel or one of the specific subinterfaces.
IKernel is the common interface for all the kernels in the core library, it contains the main methods for configure and run the kernel itself, such as window() that return the maximum window the kernel can be executed on or is_parallelisable() for indicate whether or not the kernel is parallelizable. If the kernel is parallelizable then the window returned by the window() method can be split into sub-windows which can then be run in parallel, in the other case, only the window returned by window() can be passed to the run method.
-There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
+There are specific interfaces for OpenCL and Neon™: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
- @ref ICLKernel is the common interface for all the OpenCL kernels. It implements the inherited methods and adds all the methods necessary to configure the CL kernel, such as set/return the Local-Workgroup-Size hint, add single, array or tensor argument, set the targeted GPU architecture according to the CL device. All these methods are used during the configuration and the run of the operator.
-- INEKernel inherits from @ref IKernel as well and it's the common interface for all kernels implemented in Neon, it adds just the run and the name methods.
+- INEKernel inherits from @ref IKernel as well and it's the common interface for all kernels implemented in Neon™, it adds just the run and the name methods.
There are two others implementation of @ref IKernel called @ref ICLSimpleKernel and INESimpleKernel, they are the interface for simple kernels that have just one input tensor and one output tensor.
Creating a new kernel implies adding new files:
@@ -87,7 +87,7 @@ Creating a new kernel implies adding new files:
- src/core/CL/kernels/CLReshapeLayerKernel.cpp
- src/core/CL/CLKernelLibrary.cpp
-Neon kernel
+Neon™ kernel
- arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
- src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -153,7 +153,7 @@ OpenCL function
- arm_compute/runtime/CL/functions/CLReshapeLayer.h
- src/runtime/CL/functions/CLReshapeLayer.cpp
-Neon function
+Neon™ function
- arm_compute/runtime/NEON/functions/NEReshapeLayer.h
- src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -216,7 +216,7 @@ void CLAddReshapeLayer::run()
@endcode
-For Neon:
+For Neon™:
@code{.cpp}
using namespace arm_compute;
@@ -264,7 +264,7 @@ At this point, everything is in place at the library level. If you are following
@subsubsection S4_1_4_1_add_reference Add the reference implementation and the tests
As mentioned in the introduction, the reference implementation is a pure C++ implementation without any optimization or backend specific instruction.
-The refence implementation consist of two files into the folder tests/validation/reference:
+The reference implementation consist of two files into the folder tests/validation/reference:
- tests/validation/reference/ReshapeLayer.h
- tests/validation/reference/ReshapeLayer.cpp
@@ -300,7 +300,7 @@ For example, dataset for ReshapeLayer:
Benchmark and validation tests are based on the same framework to setup and run the tests. In addition to running simple, self-contained test functions the framework supports fixtures and data test cases.
Fixtures can be used to share common setup, teardown or even run tasks among multiple test cases, for that purpose a fixture can define a "setup", "teardown" and "run" method.
-Adding tests for the new operator in the runtime library we need to implement at least the setup method, that is used to call two methods for configure, run and return the output respectively of the target (CL or Neon) and the reference (C++ implementation).
+Adding tests for the new operator in the runtime library we need to implement at least the setup method, that is used to call two methods for configure, run and return the output respectively of the target (CL or Neon™) and the reference (C++ implementation).
For example let's have a look at Reshape Layer Fixture :
@@ -310,7 +310,7 @@ In the fixture class above we can see that the setup method computes the target
The compute_target method reflects the exact behavior expected when we call a function. The input and output tensor must be declared, function configured, tensors allocated, the input tensor filled with required data, and finally, the function must be run and the results returned.
This fixture is used in the test case, that is a parameterized test case that inherits from a fixture. The test case will have access to all public and protected members of the fixture. Only the setup and teardown methods of the fixture will be used. The setup method of the fixture needs to be a template and must accept inputs from the dataset as arguments.
The body of this function will be used as a test function.
-For the fixture test case the first argument is the name of the test case (has to be unique within the enclosing test suite), the second argument is the class name of the fixture, the third argument is the dataset mode in which the test will be active (PRECOMMIT or NIGTHLY) and the fourth argument is the dataset.
+For the fixture test case the first argument is the name of the test case (has to be unique within the enclosing test suite), the second argument is the class name of the fixture, the third argument is the dataset mode in which the test will be active (PRECOMMIT or NIGHTLY) and the fourth argument is the dataset.
For example:
@snippet tests/validation/CL/ActivationLayer.cpp CLActivationLayerFixture snippet
diff --git a/docs/contributor_guide/contribution_guidelines.dox b/docs/contributor_guide/contribution_guidelines.dox
index ec3e3a70d3..f3a6def582 100644
--- a/docs/contributor_guide/contribution_guidelines.dox
+++ b/docs/contributor_guide/contribution_guidelines.dox
@@ -139,11 +139,11 @@ void foobar(const MyLargeCustomTypeClass &m); // Definitely better as const-refe
- Don't use unions
-Unions cannot be used to convert values between different types because (in C++) it is undefined behaviour to read from a member other than the last one that has been assigned to. This limits the use of unions to a few corner cases and therefor the general advice is not to use unions. See http://releases.llvm.org/3.8.0/tools/clang/tools/extra/docs/clang-tidy/checks/cppcoreguidelines-pro-type-union-access.html
+Unions cannot be used to convert values between different types because (in C++) it is undefined behaviour to read from a member other than the last one that has been assigned to. This limits the use of unions to a few corner cases and therefore the general advice is not to use unions. See http://releases.llvm.org/3.8.0/tools/clang/tools/extra/docs/clang-tidy/checks/cppcoreguidelines-pro-type-union-access.html
- Use pre-increment/pre-decrement whenever possible
-In contrast to the pre-incerement the post-increment has to make a copy of the incremented object. This might not be a problem for primitive types like int but for class like objects that overload the operators, like iterators, it can have a huge impact on the performance. See http://stackoverflow.com/a/9205011
+In contrast to the pre-increment the post-increment has to make a copy of the incremented object. This might not be a problem for primitive types like int but for class like objects that overload the operators, like iterators, it can have a huge impact on the performance. See http://stackoverflow.com/a/9205011
To be consistent across the different cases the general advice is to use the pre-increment operator unless post-increment is explicitly required. The same rules apply for the decrement operator.
@@ -438,7 +438,7 @@ You are now ready to submit your patch for review:
@section S5_3_code_review Patch acceptance and code review
-Once a patch is uploaded for review, there is a pre-commit test that runs on a Jenkins server for continuos integration tests. In order to be merged a patch needs to:
+Once a patch is uploaded for review, there is a pre-commit test that runs on a Jenkins server for continuous integration tests. In order to be merged a patch needs to:
- get a "+1 Verified" from the pre-commit job
- get a "+1 Comments-Addressed", in case of comments from reviewers the committer has to address them all. A comment is considered addressed when the first line of the reply contains the word "Done"
diff --git a/docs/user_guide/data_layout.dox b/docs/user_guide/data_layout.dox
index 48f15acd63..97d3ea6262 100644
--- a/docs/user_guide/data_layout.dox
+++ b/docs/user_guide/data_layout.dox
@@ -29,7 +29,7 @@ namespace arm_compute
@section data_layout_support_supported_data_layout Supported Data Layouts
-Compute Library supports the follwing data layouts and
+Compute Library supports the following data layouts and
the right-most letter represents the fastest changing dimension:
- NHWC: The native layout of Compute Library that delivers the best performance where channels are in the fastest changing dimension
diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox
index e57183e891..1766199eb4 100644
--- a/docs/user_guide/how_to_build_and_run_examples.dox
+++ b/docs/user_guide/how_to_build_and_run_examples.dox
@@ -161,7 +161,7 @@ To see the build options available simply run ```scons -h```:
@b arch: The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.
@b os: Choose the operating system you are targeting: Linux, Android or bare metal.
-@note bare metal can only be used for Arm® Neon™ (not OpenCL), only static libraries get built and Neon's multi-threading support is disabled.
+@note bare metal can only be used for Arm® Neon™ (not OpenCL), only static libraries get built and Neon™'s multi-threading support is disabled.
@b build: you can either build directly on your device (native) or cross compile from your desktop machine (cross-compile). In both cases make sure the compiler is available in your path.
@@ -169,11 +169,11 @@ To see the build options available simply run ```scons -h```:
There is also an 'embed_only' option which will generate all the .embed files for the OpenCL kernels. This might be useful if using a different build system to compile the library.
-In addittion the option 'compress_kernels' will compress the embedded OpenCL kernel files using zlib and inject them in the library. This is useful for reducing the binary size. Note, this option is only available for Android when 'embed_kernels' is enabled.
+In addition the option 'compress_kernels' will compress the embedded OpenCL kernel files using zlib and inject them in the library. This is useful for reducing the binary size. Note, this option is only available for Android when 'embed_kernels' is enabled.
@b Werror: If you are compiling using the same toolchains as the ones used in this guide then there shouldn't be any warning and therefore you should be able to keep Werror=1. If with a different compiler version the library fails to build because of warnings interpreted as errors then, if you are sure the warnings are not important, you might want to try to build with Werror=0 (But please do report the issue on Github).
-@b opencl / @b neon: Choose which SIMD technology you want to target. (Neon for Arm Cortex-A CPUs or OpenCL for Arm® Mali™ GPUs)
+@b opencl / @b neon: Choose which SIMD technology you want to target. (Neon™ for Arm® Cortex®-A CPUs or OpenCL for Arm® Mali™ GPUs)
@b embed_kernels: For OpenCL only: set embed_kernels=1 if you want the OpenCL kernels to be built in the library's binaries instead of being read from separate ".cl" / ".cs" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL kernel files by calling CLKernelLibrary::init(). By default the path is set to "./cl_kernels".
@@ -201,11 +201,11 @@ Example:
@b mali: Enable the collection of Arm® Mali™ hardware counters to measure execution time in benchmark tests. (Your device needs to have a Arm® Mali™ driver that supports it)
-@b openmp Build in the OpenMP scheduler for Neon.
+@b openmp Build in the OpenMP scheduler for Neon™.
@note Only works when building with g++ not clang++
-@b cppthreads Build in the C++11 scheduler for Neon.
+@b cppthreads Build in the C++11 scheduler for Neon™.
@sa Scheduler::set
@@ -272,21 +272,21 @@ The examples get automatically built by scons as part of the build process of th
To cross compile a Arm® Neon™ example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_convolution
+ arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_cnn
To cross compile a Arm® Neon™ example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_convolution
+ aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_cnn
(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
To cross compile an OpenCL example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
+ arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
To cross compile an OpenCL example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
+ aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
@@ -306,17 +306,17 @@ i.e. to cross compile the "graph_lenet" example for Linux 64bit:
To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 32bit:
- g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_convolution
+ g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_cnn
To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 64bit:
- g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_convolution
+ g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_cnn
(notice the only difference with the 32 bit command is that we don't need the -mfpu option)
To compile natively (i.e directly on an Arm device) for OpenCL for Linux 32bit or Linux 64bit:
- g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
+ g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
To compile natively the examples with the Graph API, such as graph_lenet.cpp, you need to link the examples against arm_compute_graph.so too.
@@ -337,11 +337,11 @@ i.e. to natively compile the "graph_lenet" example for Linux 64bit:
To run the built executable simply run:
- LD_LIBRARY_PATH=build ./neon_convolution
+ LD_LIBRARY_PATH=build ./neon_cnn
or
- LD_LIBRARY_PATH=build ./cl_convolution
+ LD_LIBRARY_PATH=build ./cl_sgemm
@note Examples accept different types of arguments, to find out what they are run the example with \a --help as an argument. If no arguments are specified then random values will be used to execute the graph.
@@ -374,7 +374,7 @@ For Android, the library was successfully built and tested using Google's standa
For NDK r18 or older, here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a>:
- Download the NDK r18b from here: https://developer.android.com/ndk/downloads/index.html to directory $NDK
- Make sure you have Python 2.7 installed on your machine.
-- Generate the 32 and/or 64 toolchains by running the following commands to your toolchain dirctory $MY_TOOLCHAINS:
+- Generate the 32 and/or 64 toolchains by running the following commands to your toolchain directory $MY_TOOLCHAINS:
$NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r18b --stl libc++ --api 21
$NDK/build/tools/make_standalone_toolchain.py --arch arm --install-dir $MY_TOOLCHAINS/arm-linux-android-ndk-r18b --stl libc++ --api 21
@@ -409,16 +409,16 @@ Once you've got your Android standalone toolchain built and added to your path y
To cross compile a Arm® Neon™ example:
#32 bit:
- arm-linux-androideabi-clang++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_convolution_arm -static-libstdc++ -pie
+ arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_arm -static-libstdc++ -pie
#64 bit:
- aarch64-linux-android-clang++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_convolution_aarch64 -static-libstdc++ -pie
+ aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie
To cross compile an OpenCL example:
#32 bit:
- arm-linux-androideabi-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
+ arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
#64 bit:
- aarch64-linux-android-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
+ aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
To cross compile the examples with the Graph API, such as graph_lenet.cpp, you need to link the library arm_compute_graph also.
@@ -432,28 +432,28 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n
Then you need to do is upload the executable and the shared library to the device using ADB:
- adb push neon_convolution_arm /data/local/tmp/
- adb push cl_convolution_arm /data/local/tmp/
+ adb push neon_cnn_arm /data/local/tmp/
+ adb push cl_sgemm_arm /data/local/tmp/
adb push gc_absdiff_arm /data/local/tmp/
adb shell chmod 777 -R /data/local/tmp/
And finally to run the example:
- adb shell /data/local/tmp/neon_convolution_arm
- adb shell /data/local/tmp/cl_convolution_arm
+ adb shell /data/local/tmp/neon_cnn_arm
+ adb shell /data/local/tmp/cl_sgemm_arm
adb shell /data/local/tmp/gc_absdiff_arm
For 64bit:
- adb push neon_convolution_aarch64 /data/local/tmp/
- adb push cl_convolution_aarch64 /data/local/tmp/
+ adb push neon_cnn_aarch64 /data/local/tmp/
+ adb push cl_sgemm_aarch64 /data/local/tmp/
adb push gc_absdiff_aarch64 /data/local/tmp/
adb shell chmod 777 -R /data/local/tmp/
And finally to run the example:
- adb shell /data/local/tmp/neon_convolution_aarch64
- adb shell /data/local/tmp/cl_convolution_aarch64
+ adb shell /data/local/tmp/neon_cnn_aarch64
+ adb shell /data/local/tmp/cl_sgemm_aarch64
adb shell /data/local/tmp/gc_absdiff_aarch64
@note Examples accept different types of arguments, to find out what they are run the example with \a --help as an argument. If no arguments are specified then random values will be used to execute the graph.
@@ -461,7 +461,7 @@ And finally to run the example:
For example:
adb shell /data/local/tmp/graph_lenet --help
-In this case the first argument of LeNet (like all the graph examples) is the target (i.e 0 to run on Neon, 1 to run on OpenCL if available, 2 to run on OpenCL using the CLTuner), the second argument is the path to the folder containing the npy files for the weights and finally the third argument is the number of batches to run.
+In this case the first argument of LeNet (like all the graph examples) is the target (i.e 0 to run on Neon™, 1 to run on OpenCL if available, 2 to run on OpenCL using the CLTuner), the second argument is the path to the folder containing the npy files for the weights and finally the third argument is the number of batches to run.
@section S1_4_macos Building for macOS
diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox
index 688a695466..e987eac752 100644
--- a/docs/user_guide/library.dox
+++ b/docs/user_guide/library.dox
@@ -94,13 +94,27 @@ There are different ways padding can be calculated:
If you don't want to manually set the padding but still want to allocate your objects upfront then you can use auto_padding. It guarantees that the allocation will have enough padding to run any of the provided functions.
@code{.cpp}
-Image src, dst;
+Image src{}, dst{};
+NEScale scale{};
-// Use auto padding for the input:
-src.info()->init_auto_padding(TensorShape(640u,480u), Format::U8);
+// Create an empty grayscale 640x480 image
+src.allocator()->init(TensorInfo(640, 480, Format::U8));
-// Use manual padding for the destination image
-dst.info()->init(src.info()->tensor_shape(), Format::U8, strides_in_bytes, offset_first_element_in_bytes, total_size_in_bytes);
+constexpr int scale_factor = 2;
+TensorInfo dst_tensor_info(src.info()->dimension(0) / scale_factor, src.info()->dimension(1) / scale_factor,
+ Format::U8);
+
+// Configure the destination image
+dst.allocator()->init(dst_tensor_info);
+
+// Configure Scale function object:
+scale.configure(&src, &dst, ScaleKernelInfo{
+ InterpolationPolicy::NEAREST_NEIGHBOR,
+ BorderMode::UNDEFINED,
+ PixelValue(),
+ SamplingPolicy::CENTER,
+ false
+});
// Allocate all the images
src.allocator()->allocate();
@@ -108,15 +122,12 @@ dst.allocator()->allocate();
// Fill the input image with the content of the PPM image if a filename was provided:
fill_image(src);
-NEGaussian3x3 gauss;
-
-// Apply a Gaussian 3x3 filter to the source image (Note: if the padding provided is not enough then the execution window and valid region of the output will be shrunk)
-gauss.configure(&src, &dst, BorderMode::UNDEFINED);
-
-//Execute the functions:
-gauss.run();
+// Run the scale operation:
+scale.run();
@endcode
+The full example is provided in examples/neon_scale.cpp
+
@warning Some kernels need up to 3 neighbor values to calculate the value of a given pixel. Therefore, to be safe, we use a 4-pixel padding all around the image. In addition, some kernels read and write up to 32 pixels at the same time. To cover that case as well we add an extra 32 pixels of padding at the end of each row. As a result auto padded buffers waste a lot of memory and are less cache friendly. It is therefore recommended to use accurate padding or manual padding wherever possible.
@subsubsection architecture_images_tensors_valid_region Valid regions
@@ -255,7 +266,7 @@ tmp2.allocator()->allocate(); // Flag that the lifetime of object tmp2 has
tmp3.allocator()->allocate(); // Flag that the lifetime of object tmp3 has ended
@endcode
-@warning The configuration step should be done sequentially by a single thread so that all the lifetimes are captured correclty.
+@warning The configuration step should be done sequentially by a single thread so that all the lifetimes are captured correctly.
When configuration of all the operations is finished then the memory manager have to be populated:
@code{.cpp}
@@ -339,7 +350,7 @@ However this process takes quite a lot of time, which is why it cannot be enable
But, when the @ref CLTuner is disabled ( Target = 1 for the graph examples), the @ref graph::Graph will try to reload the file containing the tuning parameters, then for each executed kernel the Compute Library will use the fine tuned LWS if it was present in the file or use a default LWS value if it's not.
-@section architecture_cl_queue_prioritites OpenCL Queue Priorities
+@section architecture_cl_queue_priorities OpenCL Queue Priorities
OpenCL 2.1 exposes the `cl_khr_priority_hints` extensions that if supported by an underlying implementation allows the user to specify priority hints to the created command queues.
Is important to note that this does not specify guarantees or the explicit scheduling behavior, this is something that each implementation needs to expose.
@@ -432,7 +443,7 @@ Consequently, this will allow finer control of these services among pipelines wh
This feature introduces some changes to our API.
All the kernels/functions will now accept a Runtime Context object which will allow the function to use the mentioned services.
-Finally, we will try to adapt our code-base progressively to use the new mechanism but will continue supporting the legacy mechanism to allow a smooth transition. Changes will apply to all our three backends: Neon, OpenCL and OpenGL ES.
+Finally, we will try to adapt our code-base progressively to use the new mechanism but will continue supporting the legacy mechanism to allow a smooth transition. Changes will apply to all our backends: Neon™ and OpenCL.
@subsection architecture_experimental_clvk CLVK
@@ -479,7 +490,7 @@ times under the same execution context
- #AclPreferFastStart: Provides faster single execution. It can be used when the operators will be executed only once,
thus reducing their latency is important (Currently, it is not implemented)
-@paragraph architecture_experimental_api_object_context_capabilitys AclTargetCapabilities
+@paragraph architecture_experimental_api_object_context_capabilities AclTargetCapabilities
Context creation can also have a list of capabilities of hardware as one of its parameters. This is currently
available only for the CPU backend. A list of architecture capabilities can be passed to influence the selection
of the underlying kernels. Such capabilities can be for example the enablement of SVE or the dot product
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index fc41265738..05cc892d40 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -45,14 +45,14 @@ The main data-types that the Machine Learning functions support are the followin
<li>F16: 16-bit half precision floating point
<li>S32: 32-bit signed integer
<li>U8: 8-bit unsigned char
- <li>All: include all above data types
+ <li>All: Agnostic to any specific data type
</ul>
Compute Library supports the following data layouts (fast changing dimension from right to left):
<ul>
<li>NHWC: The native layout of Compute Library that delivers the best performance where channels are in the fastest changing dimension
<li>NCHW: Legacy layout where width is in the fastest changing dimension
- <li>All: include all above data layouts
+ <li>All: Agnostic to any specific data layout
</ul>
where N = batches, C = channels, H = height, W = width
@@ -264,7 +264,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">BitwiseAnd
- <td rowspan="2" style="width:200px;"> Function to performe bitwise AND between 2 tensors.
+ <td rowspan="2" style="width:200px;"> Function to perform bitwise AND between 2 tensors.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_LOGICAL_AND
@@ -292,7 +292,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">BitwiseNot
- <td rowspan="2" style="width:200px;"> Function to performe bitwise NOT.
+ <td rowspan="2" style="width:200px;"> Function to perform bitwise NOT.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_LOGICAL_NOT
@@ -320,7 +320,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">BitwiseOr
- <td rowspan="2" style="width:200px;"> Function to performe bitwise OR between 2 tensors.
+ <td rowspan="2" style="width:200px;"> Function to perform bitwise OR between 2 tensors.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_LOGICAL_OR
@@ -348,7 +348,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">BitwiseXor
- <td rowspan="2" style="width:200px;"> Function to performe bitwise XOR between 2 tensors.
+ <td rowspan="2" style="width:200px;"> Function to perform bitwise XOR between 2 tensors.
<td rowspan="2">
<ul>
<li>n/a
@@ -535,7 +535,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">ConvertFullyConnectedWeights
- <td rowspan="2" style="width:200px;"> Function to tranpose the wieghts for the fully connected layer.
+ <td rowspan="2" style="width:200px;"> Function to transpose the weights for the fully connected layer.
<td rowspan="2">
<ul>
<li>n/a
@@ -678,7 +678,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">DeconvolutionLayer
- <td rowspan="2" style="width:200px;"> Function to compute a deconvolution or tranpose convolution.
+ <td rowspan="2" style="width:200px;"> Function to compute a deconvolution or transpose convolution.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_TRANSPOSE_CONV_2D
@@ -957,7 +957,7 @@ where N = batches, C = channels, H = height, W = width
<tr><td>QASYMM8_SIGNED<td>QSYMM8_PER_CHANNEL<td>S32<td>QASYMM8_SIGNED
</table>
<tr>
- <td rowspan="13">ElementWiseOperations
+ <td rowspan="13">ElementwiseOperations
<td rowspan="13" style="width:200px;"> Function to perform in Cpu: - Div - Max - Min - Pow - SquaredDiff - Comparisons (Equal, greater, greater_equal, less, less_equal, not_equal) Function to perform in CL: - Add - Sub - Div - Max - Min - Pow - SquaredDiff
<td rowspan="13">
<ul>
@@ -1242,6 +1242,7 @@ where N = batches, C = channels, H = height, W = width
<tr><th>src<th>dst
<tr><td>F16<td>F16
<tr><td>F32<td>F32
+ <tr><td>S32<td>S32
</table>
<tr>
<td>CLSinLayer
@@ -1408,7 +1409,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">FillBorder
- <td rowspan="2" style="width:200px;"> Function to .
+ <td rowspan="2" style="width:200px;"> Function to fill the borders within the XY-planes.
<td rowspan="2">
<ul>
<li>n/a
@@ -1620,7 +1621,7 @@ where N = batches, C = channels, H = height, W = width
<tr><td>F16<td>F16<td>F16<td>F16
</table>
<tr>
- <td rowspan="1">GEMMConv2D
+ <td rowspan="1">GEMMConv2d
<td rowspan="1" style="width:200px;"> General Matrix Multiplication.
<td rowspan="1">
<ul>
@@ -2193,7 +2194,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">PixelWiseMultiplication
- <td rowspan="2" style="width:200px;"> Function to performe a multiplication.
+ <td rowspan="2" style="width:200px;"> Function to perform a multiplication.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_MUL
@@ -2237,11 +2238,12 @@ where N = batches, C = channels, H = height, W = width
<tr><td>S16<td>U8<td>S16
<tr><td>S16<td>S16<td>S16
<tr><td>F16<td>F16<td>F16
- <tr><td>F32<td>S32<td>F32
+ <tr><td>F32<td>F32<td>F32
+ <tr><td>S32<td>S32<td>S32
</table>
<tr>
<td rowspan="2">PoolingLayer
- <td rowspan="2" style="width:200px;"> Function to performe pooling with the specified pooling operation.
+ <td rowspan="2" style="width:200px;"> Function to perform pooling with the specified pooling operation.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_AVERAGE_POOL_2D
@@ -2449,7 +2451,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">ReduceMean
- <td rowspan="2" style="width:200px;"> Function to performe reduce mean operation.
+ <td rowspan="2" style="width:200px;"> Function to perform reduce mean operation.
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_MEAN
@@ -2483,7 +2485,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="2">ReductionOperation
- <td rowspan="2" style="width:200px;"> Function to performe reduce with the following operations - ARG_IDX_MAX: Index of the max value - ARG_IDX_MIN: Index of the min value - MEAN_SUM: Mean of sum - PROD: Product - SUM_SQUARE: Sum of squares - SUM: Sum - MIN: Min - MAX: Max
+ <td rowspan="2" style="width:200px;"> Function to perform reduce with the following operations - ARG_IDX_MAX: Index of the max value - ARG_IDX_MIN: Index of the min value - MEAN_SUM: Mean of sum - PROD: Product - SUM_SQUARE: Sum of squares - SUM: Sum - MIN: Min - MAX: Max
<td rowspan="2">
<ul>
<li>ANEURALNETWORKS_REDUCE_ALL
@@ -3100,7 +3102,7 @@ where N = batches, C = channels, H = height, W = width
</table>
<tr>
<td rowspan="1">WinogradInputTransform
- <td rowspan="1" style="width:200px;"> Function to.
+ <td rowspan="1" style="width:200px;"> Function to perform a Winograd transform on the input tensor.
<td rowspan="1">
<ul>
<li>n/a
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 557eff0779..20995af693 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -44,8 +44,8 @@ v21.05 Public major release
- Various bug fixes.
- Various optimisations.
- Various documentation updates:
- - Add supported operators and coressponding Android NNAPI operators.
- - Documentaiton reorg into user guide and contributor guide.
+ - Add supported operators and corresponding Android NNAPI operators.
+ - Documentation reorg into user guide and contributor guide.
- Add support for a global allocator for OpenCL tensors
- Add experimental support for [CLVK](https://github.com/kpet/clvk).
- Add data type S32 support for:
@@ -201,7 +201,7 @@ v21.02 Public major release
- Add functionality to load the OpenCL GEMM heuristics at runtime
- The GEMM heuristic file (MLGO) can be used to update the default GEMM heuristics available for OpenCL
- Note: there might be performance regressions against v20.08 in Inception v3 using int8 data types on Arm Mali-G77 GPUs. Currently under investigation
- - Note: data-type decoupling is in progress and expiremental. Warning of unused symbols might be raised
+ - Note: data-type decoupling is in progress and experimental. Warning of unused symbols might be raised
v20.11 Public major release
- Various bug fixes.
@@ -514,7 +514,7 @@ v20.08 Public major release
- Added new data type U8 support for:
- @ref NECropKernel
- CLCropKernel
- - Added aligh_corner support for nearest neighbor interpolation in:
+ - Added align_corner support for nearest neighbor interpolation in:
- NEScaleKernel
- CLScaleKernel
- New OpenCL kernels / functions:
@@ -828,7 +828,7 @@ v19.08 Public major release
- Enable the fusion of batch normalization with convolution and depthwise convolution layer for FP32 in the graph API (OpenCL only)
- Added support for fusing activation function and broadcast addition with the matrix multiplication for FP32 (OpenCL only)
- Re-factored the depthwise convolution layer kernel on Arm® Neon™ for generic cases
- - Added an optimized depthwise convolution layer kernel for 5x5 filters (Neon only)
+ - Added an optimized depthwise convolution layer kernel for 5x5 filters (Neon™ only)
- Added support to enable OpenCL kernel cache. Added example showing how to load the prebuilt OpenCL kernels from a binary cache file
- Altered @ref QuantizationInfo interface to support per-channel quantization.
- The CLDepthwiseConvolutionLayer3x3 will be included by @ref CLDepthwiseConvolutionLayer to accommodate for future optimizations.
@@ -967,8 +967,8 @@ v19.02 Public major release
- Add 4D tensors support to
- @ref NESoftmaxLayer
- Fused activation in @ref CLWinogradConvolutionLayer
- - Extented @ref NEPermute to support more cases
- - Added Neon/SVE GEMM Hybrid kernels
+ - Extended @ref NEPermute to support more cases
+ - Added Neon™/SVE GEMM Hybrid kernels
- Added u8 and s8 hybrid assembly kernels
- Introduced GEMM strategy name in NEGEMMAssemblyWrapper
- Improved @ref CLTuner
@@ -1078,11 +1078,11 @@ v18.08 Public major release
v18.05 Public major release
- Various bug fixes.
- Various optimisations.
- - Major redesign in the interface for the neon kernels implemented in assembly.
+ - Major redesign in the interface for the Neon™ kernels implemented in assembly.
- Removed arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore / arm_compute::NEHGEMMAArch64FP16Kernel
- - Added NEGEMMAssemblyWrapper and AssemblyKernelGlue which are used to execute assembly kernels in neon functions.
+ - Added NEGEMMAssemblyWrapper and AssemblyKernelGlue which are used to execute assembly kernels in Neon™ functions.
- Minor changes to the CPUInfo type to make it compatible with the new assembly gemm interface.
- - Moved neon assembly kernels to the folder src/core/Neon/kernels/arm_gemm.
+ - Moved Neon™ assembly kernels to the folder src/core/Neon/kernels/arm_gemm.
- Improved doxygen documentation.
- Improved memory management for layer's transitions.
- Added support for NHWC data layout in tensors.
@@ -1123,7 +1123,7 @@ v18.05 Public major release
- Replaced NEDeconvolutionLayerUpsampleKernel with NEScaleKernel in @ref NEDeconvolutionLayer.
- Added fast maths flag in @ref CLConvolutionLayer.
- Added new tests and benchmarks in validation and benchmark frameworks
- - Merge Activation layer with Convolution Layer (Neon. CL, GLES)
+ - Merge Activation layer with Convolution Layer (Neon™, CL, GLES)
- Added support to OpenCL 2.0 SVM
- Added support to import memory in OpenCL tensors.
- Added the prepare() method to perform any one off pre-processing before running the function.
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index dbf2121bfd..a45dd6f9a6 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
diff --git a/tests/validation/CL/AbsLayer.cpp b/tests/validation/CL/AbsLayer.cpp
index e6ba14b50e..0bad8f9b68 100644
--- a/tests/validation/CL/AbsLayer.cpp
+++ b/tests/validation/CL/AbsLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/ExpLayer.cpp b/tests/validation/CL/ExpLayer.cpp
index 16e75a64b4..1797046e5d 100644
--- a/tests/validation/CL/ExpLayer.cpp
+++ b/tests/validation/CL/ExpLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/LogLayer.cpp b/tests/validation/CL/LogLayer.cpp
index 95c4f1226e..895c306841 100644
--- a/tests/validation/CL/LogLayer.cpp
+++ b/tests/validation/CL/LogLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/NegLayer.cpp b/tests/validation/CL/NegLayer.cpp
index 01fa792543..c93e31dca9 100644
--- a/tests/validation/CL/NegLayer.cpp
+++ b/tests/validation/CL/NegLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/RoundLayer.cpp b/tests/validation/CL/RoundLayer.cpp
index 5aa9ca6b4e..f0c88d1ad3 100644
--- a/tests/validation/CL/RoundLayer.cpp
+++ b/tests/validation/CL/RoundLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/RsqrtLayer.cpp b/tests/validation/CL/RsqrtLayer.cpp
index 29c113b105..936d853d34 100644
--- a/tests/validation/CL/RsqrtLayer.cpp
+++ b/tests/validation/CL/RsqrtLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/CL/SinLayer.cpp b/tests/validation/CL/SinLayer.cpp
index e40c990db6..f0cb4c314e 100644
--- a/tests/validation/CL/SinLayer.cpp
+++ b/tests/validation/CL/SinLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
index 87f4c7f187..ccde670034 100644
--- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
+++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp
index 211e10fa45..f9e5f39989 100644
--- a/tests/validation/NEON/ElementwiseExpLayer.cpp
+++ b/tests/validation/NEON/ElementwiseExpLayer.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp
index 3115ed6065..3aa7fb3665 100644
--- a/tests/validation/NEON/ElementwiseLog.cpp
+++ b/tests/validation/NEON/ElementwiseLog.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp
index 629baa80e6..0b63588d8a 100644
--- a/tests/validation/NEON/ElementwiseNegation.cpp
+++ b/tests/validation/NEON/ElementwiseNegation.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp
index 5ff81a5d8a..d2f0b456a0 100644
--- a/tests/validation/NEON/ElementwiseRound.cpp
+++ b/tests/validation/NEON/ElementwiseRound.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index 1591b76cd7..2d52183b15 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp
index 9b212e264f..06775c0690 100644
--- a/tests/validation/NEON/ElementwiseSin.cpp
+++ b/tests/validation/NEON/ElementwiseSin.cpp
@@ -32,7 +32,7 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
namespace arm_compute
{
diff --git a/tests/validation/fixtures/ElementWiseUnaryFixture.h b/tests/validation/fixtures/ElementwiseUnaryFixture.h
index 661af91763..7221226fd1 100644
--- a/tests/validation/fixtures/ElementWiseUnaryFixture.h
+++ b/tests/validation/fixtures/ElementwiseUnaryFixture.h
@@ -31,7 +31,7 @@
#include "tests/IAccessor.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/ElementWiseUnary.h"
+#include "tests/validation/reference/ElementwiseUnary.h"
namespace arm_compute
{
diff --git a/tests/validation/reference/ElementWiseUnary.cpp b/tests/validation/reference/ElementwiseUnary.cpp
index 1d46ed648f..5333b53c15 100644
--- a/tests/validation/reference/ElementWiseUnary.cpp
+++ b/tests/validation/reference/ElementwiseUnary.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "ElementWiseUnary.h"
+#include "ElementwiseUnary.h"
namespace arm_compute
{
diff --git a/tests/validation/reference/ElementWiseUnary.h b/tests/validation/reference/ElementwiseUnary.h
index be4a229a5b..be4a229a5b 100644
--- a/tests/validation/reference/ElementWiseUnary.h
+++ b/tests/validation/reference/ElementwiseUnary.h