author    Michele Di Giorgio <michele.digiorgio@arm.com>  2021-03-09 14:09:08 +0000
committer Michele Di Giorgio <michele.digiorgio@arm.com>  2021-03-31 17:08:51 +0000
commit    33f41fabd30fb444aaa0cf3e65b61794d498d151
tree      a381cff3096a3b05198b0cd311fee28e40fd5a4f
parent    5f91b5d7063462854b62d342f9d4e04ae647e9a6
Fix trademarks throughout the codebase
Resolves: COMPMID-4299
Change-Id: Ie6a52c1371b9a2a7b5bb4f019ecd5e70a2008567
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5338
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- README.md | 4
-rw-r--r-- SConstruct | 10
-rw-r--r-- arm_compute/core/ITensor.h | 2
-rw-r--r-- arm_compute/graph/Types.h | 2
-rw-r--r-- arm_compute/graph/Utils.h | 2
-rw-r--r-- arm_compute/graph/backends/NEON/NEDeviceBackend.h | 4
-rw-r--r-- arm_compute/graph/backends/NEON/NEFunctionFactory.h | 2
-rw-r--r-- arm_compute/graph/backends/NEON/NESubTensorHandle.h | 2
-rw-r--r-- arm_compute/graph/backends/NEON/NETensorHandle.h | 2
-rw-r--r-- arm_compute/runtime/IFunction.h | 2
-rw-r--r-- arm_compute/runtime/NEON/INEOperator.h | 2
-rw-r--r-- arm_compute/runtime/NEON/INESimpleFunction.h | 2
-rw-r--r-- arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h | 2
-rw-r--r-- arm_compute/runtime/NEON/NEScheduler.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h | 8
-rw-r--r-- arm_compute/runtime/NEON/functions/NEConvolutionLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h | 4
-rw-r--r-- arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEFFT1D.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEFFT2D.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h | 6
-rw-r--r-- arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h | 4
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMM.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMMConv2d.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 3
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h | 18
-rw-r--r-- arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NENormalizationLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEPadLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEPermute.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEPoolingLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEQuantizationLayer.h | 7
-rw-r--r-- arm_compute/runtime/NEON/functions/NEROIAlignLayer.h | 7
-rw-r--r-- arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h | 7
-rw-r--r-- arm_compute/runtime/NEON/functions/NEReductionOperation.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NERemap.h | 5
-rw-r--r-- arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h | 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h | 5
-rw-r--r-- arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h | 3
-rw-r--r-- docs/00_introduction.dox | 126
-rw-r--r-- docs/01_library.dox | 20
-rw-r--r-- docs/02_tests.dox | 4
-rw-r--r-- docs/04_adding_operator.dox | 12
-rw-r--r-- docs/06_functions_list.dox | 2
-rw-r--r-- docs/07_errata.dox | 4
-rw-r--r-- docs/ComputeLibrary.dir | 30
-rw-r--r-- examples/neon_cnn.cpp | 2
-rw-r--r-- examples/neon_copy_objects.cpp | 2
-rw-r--r-- scripts/include_functions_kernels.py | 25
-rw-r--r-- src/core/CL/cl_kernels/helpers_asymm.h | 4
-rw-r--r-- src/core/GPUTarget.cpp | 6
-rw-r--r-- src/core/NEON/INESimpleKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp | 4
-rw-r--r-- src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NECol2ImKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NECol2ImKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEFillBorderKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpReductionKernel.h | 6
-rw-r--r-- src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEGatherKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NENormalizationLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEPadLayerKernel.h | 5
-rw-r--r-- src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NERangeKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEReductionOperationKernel.cpp | 6
-rw-r--r-- src/core/NEON/kernels/NEReductionOperationKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NERemapKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEReorgLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEReverseKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEStackLayerKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEStackLayerKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NETileKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2
-rw-r--r-- src/core/NEON/kernels/NEWeightsReshapeKernel.h | 2
-rw-r--r-- src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h | 14
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 2
-rw-r--r-- src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp | 24
-rw-r--r-- src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp | 2
-rw-r--r-- src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp | 2
-rw-r--r-- src/core/NEON/kernels/detail/NEActivationFunctionDetail.h | 26
-rw-r--r-- src/core/NEON/wrapper/traits.h | 4
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp | 2
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp | 2
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp | 2
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp | 2
-rw-r--r-- src/core/cpu/kernels/CpuReshapeKernel.cpp | 2
-rw-r--r-- src/core/cpu/kernels/CpuTransposeKernel.cpp | 8
-rw-r--r-- src/core/cpu/kernels/activation/NEON/fp16.cpp | 2
-rw-r--r-- src/core/cpu/kernels/activation/NEON/fp32.cpp | 2
-rw-r--r-- src/core/cpu/kernels/add/neon/list.h | 2
-rw-r--r-- src/core/cpu/kernels/pooling/neon/quantized.h | 6
-rw-r--r-- src/core/cpu/kernels/softmax/impl/NEON/list.h | 4
-rw-r--r-- src/core/cpu/kernels/sub/neon/list.h | 2
-rw-r--r-- src/graph/backends/NEON/NEDeviceBackend.cpp | 6
-rw-r--r-- src/graph/backends/NEON/NEFunctionFactory.cpp | 8
-rw-r--r-- src/graph/backends/NEON/NENodeValidator.cpp | 4
-rw-r--r-- src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 2
-rw-r--r-- src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 4
-rw-r--r-- src/runtime/cpu/operators/CpuPooling.h | 2
-rw-r--r-- tests/SConscript | 2
-rw-r--r-- tests/datasets/ScaleValidationDataset.h | 2
-rw-r--r-- tests/framework/SConscript | 8
-rw-r--r-- tests/framework/instruments/hwc_names.hpp | 4
-rw-r--r-- tests/validation/NEON/DetectionPostProcessLayer.cpp | 2
-rw-r--r-- tests/validation/NEON/QLSTMLayerNormalization.cpp | 4
-rw-r--r-- tests/validation/fixtures/DepthConvertLayerFixture.h | 2
-rw-r--r-- tests/validation/reference/UtilsQuantizedAsymm.h | 2
-rw-r--r-- utils/GraphUtils.h | 2
132 files changed, 326 insertions, 325 deletions
diff --git a/README.md b/README.md
index e44f3a5045..5770f69cd5 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Binaries available at https://github.com/ARM-software/ComputeLibrary/releases.
- x86
-### Supported OS
+### Supported Systems
- Android™
- Bare Metal
@@ -72,7 +72,7 @@ You must use your real name, no pseudonyms or anonymous contributions are accept
Android is a trademark of Google LLC.
-Arm, Cortex and Mali are registered trademarks or trademarks of Arm Limited (or its subsidiaries) in the US and/or elsewhere.
+Arm, Cortex, Mali and Neon are registered trademarks or trademarks of Arm Limited (or its subsidiaries) in the US and/or elsewhere.
Linux® is the registered trademark of Linus Torvalds in the U.S. and other countries.
diff --git a/SConstruct b/SConstruct
index 48b01c9a06..9190d490d7 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1,4 +1,6 @@
-# Copyright (c) 2016, 2017 Arm Limited.
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2016-2021 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -53,7 +55,7 @@ vars.AddVariables(
BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
BoolVariable("standalone", "Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute", False),
BoolVariable("opencl", "Enable OpenCL support", True),
- BoolVariable("neon", "Enable Neon support", False),
+ BoolVariable("neon", "Enable Arm® Neon™ support", False),
BoolVariable("embed_kernels", "Embed OpenCL kernels and OpenGL ES compute shaders in library binary", True),
BoolVariable("compress_kernels", "Compress embedded OpenCL kernels in library binary. Note embed_kernels should be enabled", False),
BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),
@@ -123,7 +125,7 @@ if env['build'] == "embed_only":
Return()
if env['neon'] and 'x86' in env['arch']:
- print("Cannot compile Neon for x86")
+ print("Cannot compile Arm® Neon™ for x86")
Exit(1)
if env['set_soname'] and not version_at_least(SCons.__version__, "2.4"):
@@ -285,7 +287,7 @@ if not GetOption("help"):
print("GCC 6.2.1 or newer is required to compile armv8.2-a code")
Exit(1)
elif env['arch'] == 'arm64-v8a' and not version_at_least(compiler_ver, '4.9'):
- print("GCC 4.9 or newer is required to compile Neon code for AArch64")
+ print("GCC 4.9 or newer is required to compile Arm® Neon™ code for AArch64")
Exit(1)
if version_at_least(compiler_ver, '6.1'):
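
For reference, the boolean variables defined in this SConstruct drive the build directly; a Neon-only Linux build can be requested with, e.g., `scons neon=1 opencl=0 os=linux arch=arm64-v8a` (the `neon` and `opencl` option names appear above; the `os` and `arch` values are assumed from the standard Compute Library build documentation).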
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index abcec60b79..131ee205ea 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -32,7 +32,7 @@ namespace arm_compute
{
class Coordinates;
-/** Interface for Neon tensor */
+/** Interface for CPU tensor */
class ITensor
{
public:
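
ITensor is the interface implemented by the runtime Tensor class for CPU memory. A minimal allocation sketch using the public runtime API (shape and data type chosen arbitrarily):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor tensor; // concrete CPU implementation of ITensor
    tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    tensor.allocator()->allocate(); // raw data is then reachable via tensor.buffer()
    return 0;
}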
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index b1154daef3..4f3ce76370 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -96,7 +96,7 @@ struct GraphConfig
enum class Target
{
UNSPECIFIED, /**< Unspecified Target */
- NEON, /**< Neon capable target device */
+ NEON, /**< Arm® Neon™ capable target device */
CL, /**< OpenCL capable target device */
};
diff --git a/arm_compute/graph/Utils.h b/arm_compute/graph/Utils.h
index e32ac29f35..6c3b76839e 100644
--- a/arm_compute/graph/Utils.h
+++ b/arm_compute/graph/Utils.h
@@ -76,7 +76,7 @@ bool is_target_supported(Target target);
/** Returns default target for execution
*
* @note If an OpenCL backend exists then OpenCL is returned,
- * else if the Neon backend exists returns Neon as target.
+ * else if the CPU backend exists returns @ref Target::NEON as target.
* If no backends are registered an error is raised.
*
* @return Default target
diff --git a/arm_compute/graph/backends/NEON/NEDeviceBackend.h b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
index 01d2be010a..0b343c0d67 100644
--- a/arm_compute/graph/backends/NEON/NEDeviceBackend.h
+++ b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
@@ -34,7 +34,7 @@ namespace graph
{
namespace backends
{
-/** Neon device backend */
+/** CPU device backend */
class NEDeviceBackend final : public IDeviceBackend
{
public:
@@ -54,7 +54,7 @@ public:
std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
private:
- Allocator _allocator; /**< Neon backend allocator */
+ Allocator _allocator; /**< Backend allocator */
};
} // namespace backends
} // namespace graph
diff --git a/arm_compute/graph/backends/NEON/NEFunctionFactory.h b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
index 4dd1f1f5ca..6365b71f32 100644
--- a/arm_compute/graph/backends/NEON/NEFunctionFactory.h
+++ b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
@@ -38,7 +38,7 @@ class GraphContext;
namespace backends
{
-/** Factory for generating Neon backend functions **/
+/** Factory for generating CPU backend functions **/
class NEFunctionFactory final
{
public:
diff --git a/arm_compute/graph/backends/NEON/NESubTensorHandle.h b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
index 534f1c4e06..a438b65735 100644
--- a/arm_compute/graph/backends/NEON/NESubTensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
@@ -34,7 +34,7 @@ namespace graph
{
namespace backends
{
-/** Neon Sub-Tensor handle interface object **/
+/** CPU Sub-Tensor handle interface object **/
class NESubTensorHandle final : public ITensorHandle
{
public:
diff --git a/arm_compute/graph/backends/NEON/NETensorHandle.h b/arm_compute/graph/backends/NEON/NETensorHandle.h
index 211990fe6c..99101a8fe9 100644
--- a/arm_compute/graph/backends/NEON/NETensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NETensorHandle.h
@@ -34,7 +34,7 @@ namespace graph
{
namespace backends
{
-/** Neon Tensor handle interface object **/
+/** CPU Tensor handle interface object **/
class NETensorHandle final : public ITensorHandle
{
public:
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index 2aec0c007f..b7b28f999d 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -34,7 +34,7 @@ public:
virtual ~IFunction() = default;
/** Run the kernels contained in the function
*
- * For Neon kernels:
+ * For CPU kernels:
* - Multi-threading is used for the kernels which are parallelisable.
* - By default std::thread::hardware_concurrency() threads are used.
*
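
The worker-thread count used by these parallelised CPU kernels can be overridden through the scheduler singleton; a minimal sketch, assuming the standard IScheduler interface:

#include "arm_compute/runtime/Scheduler.h"

// Parallelisable CPU kernels default to std::thread::hardware_concurrency()
// worker threads; this caps them at four instead.
void limit_workers()
{
    arm_compute::Scheduler::get().set_num_threads(4);
}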
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
index 184a5959b4..5637d831a3 100644
--- a/arm_compute/runtime/NEON/INEOperator.h
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -39,7 +39,7 @@ class Window;
using INEKernel = ICPPKernel;
namespace experimental
{
-/** Basic interface for functions which have a single async Neon kernel */
+/** Basic interface for functions which have a single async CPU kernel */
class INEOperator : public IOperator
{
public:
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
index 8c7cf6512c..7512759bd0 100644
--- a/arm_compute/runtime/NEON/INESimpleFunction.h
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -33,7 +33,7 @@ namespace arm_compute
class ICPPKernel;
class NEFillBorderKernel;
using INEKernel = ICPPKernel;
-/** Basic interface for functions which have a single Neon kernel */
+/** Basic interface for functions which have a single CPU kernel */
class INESimpleFunction : public IFunction
{
public:
diff --git a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
index 52bd5f333b..dc4bac17e4 100644
--- a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
+++ b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
@@ -33,7 +33,7 @@ namespace arm_compute
{
class ICPPKernel;
using INEKernel = ICPPKernel;
-/** Basic interface for functions which have a single Neon kernel and no border */
+/** Basic interface for functions which have a single CPU kernel and no border */
class INESimpleFunctionNoBorder : public IFunction
{
public:
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index 542142a30a..a3082d00f6 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -28,7 +28,7 @@
namespace arm_compute
{
-/** Neon Scheduler */
+/** CPU Scheduler */
using NEScheduler = Scheduler;
}
#endif /*ARM_COMPUTE_NESCHEDULER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 8235185a8e..cbf1d5b444 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -37,7 +37,7 @@ class ITensor;
/** Function to calculate the index of the minimum or maximum values in a
* tensor based on an axis.
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEReductionOperationKernel
* -# @ref NEFillBorderKernel
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index de8dfef4ed..c377520a12 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,11 +32,7 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
-/** Basic function to run @ref NEBoundingBoxTransformKernel.
- *
- * This function calls the following Neon kernels:
- * -# @ref NEBoundingBoxTransformKernel
- */
+/** Basic function to run @ref NEBoundingBoxTransformKernel. */
class NEBoundingBoxTransform : public INESimpleFunctionNoBorder
{
public:
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index a387255b6c..d2d41c1e8a 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -37,7 +37,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Basic function to simulate a convolution layer. This function calls one of the following Neon functions:
+/** Basic function to simulate a convolution layer. This function calls one of the following functions:
* -# @ref NEGEMMConvolutionLayer (executed only in case GEMM is required for the operation)
* -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation)
* -# @ref NEDirectConvolutionLayer (executed only in case Direct Convolution is required for the operation)
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 02a0f784ec..3864a663c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -64,7 +64,7 @@ namespace arm_compute
* The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. Therefore, it will be necessary to use the weights in the
* reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
*
- * This function calls the following Neon kernels/functions:
+ * This function calls the following kernels/functions:
*
* -# @ref CPPUpsample
* -# @ref NEConvolutionLayer
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 98fffe0b33..9aa8f04eb8 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -110,7 +110,7 @@ private:
const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Basic function to execute optimized depthwise convolution routines. This function calls the following Neon kernels:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
*
* @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
*
@@ -192,7 +192,7 @@ private:
bool _is_prepared;
};
- /** Basic function to execute a generic depthwise convolution. This function calls the following Neon kernel:
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
*
* -# @ref NEDepthwiseConvolutionLayerNativeKernel
*
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index ff0c3054d8..86914fa0bc 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -41,7 +41,7 @@ class NEFillBorderKernel;
/** Function to run the direct convolution.
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEFillBorderKernel for the input
* -# @ref NEDirectConvolutionLayerOutputStageKernel
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index 04e8f81b69..a533aa7f48 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -40,7 +40,7 @@ class NEFFTDigitReverseKernel;
class NEFFTRadixStageKernel;
class NEFFTScaleKernel;
-/** Basic function to execute one dimensional FFT. This function calls the following Neon kernels:
+/** Basic function to execute one dimensional FFT. This function calls the following kernels:
*
* -# @ref NEFFTDigitReverseKernel Performs digit reverse
* -# @ref NEFFTRadixStageKernel A list of FFT kernels depending on the radix decomposition
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index 218401b429..ce84a85105 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -36,7 +36,7 @@ namespace arm_compute
// Forward declaration
class ITensor;
-/** Basic function to execute two dimensional FFT. This function calls the following Neon kernels:
+/** Basic function to execute two dimensional FFT. This function calls the following kernels:
*
* -# @ref NEFFT1D 1D FFT is performed on the first given axis
* -# @ref NEFFT1D 1D FFT is performed on the second given axis
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index 8967363e7f..213fa6093b 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -43,7 +43,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Basic function to execute FFT-based convolution on Neon. This function calls the following Neon functions/kernels:
+/** Basic function to execute FFT-based convolution on CPU. This function calls the following functions/kernels:
*
* -# @ref NEPermute Permute input if NHWC(only NCHW is supported).
* -# @ref NEPadLayer Pad input.
@@ -84,7 +84,7 @@ public:
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for Neon backend.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*/
void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
@@ -101,7 +101,7 @@ public:
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for Neon backend.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*
* @return a status
*/
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 1b3f36d866..58b11744a8 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -35,7 +35,7 @@
namespace arm_compute
{
-/** Basic function to reshape the weights of Fully Connected layer with Neon. This function calls the following kernels:
+/** Basic function to reshape the weights of Fully Connected layer. This function calls the following kernels:
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -117,7 +117,7 @@ private:
};
} // namespace weights_transformations
-/** Basic function to compute a Fully Connected layer on Neon. This function calls the following Neon kernels:
+/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
* -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
* -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 6f7951eece..a6c3436656 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -43,7 +43,7 @@ class NEGEMMMatrixMultiplyKernel;
class NEGEMMTranspose1xWKernel;
class NEGEMMAssemblyDispatch;
-/** Basic function to execute GEMM on Neon. This function calls the following Neon kernels:
+/** Basic function to execute GEMM. This function calls the following kernels:
*
* If optimized assembly is available:
* -# @ref NEGEMMAssemblyDispatch
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index 2bd233f520..8c3ba4f0c8 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -38,7 +38,7 @@ namespace arm_compute
class ITensor;
class NEGEMMAssemblyDispatch;
-/** Basic function to compute the convolution layer. This function calls the following Neon kernels/functions:
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
* Supports only NHWC data layout
*
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 33f00c087c..9897bf1d4d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -150,7 +150,7 @@ private:
};
} // namespace weights_transformations
-/** Basic function to compute the convolution layer. This function calls the following Neon kernels/functions:
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
* -# @ref NEIm2ColKernel
* -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 821b498dad..b2b77bd406 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -37,7 +37,6 @@ namespace arm_compute
{
class ITensor;
class NEConvertQuantizedSignednessKernel;
-class NEConvertQuantizedSignednessKernel;
class NEGEMMInterleave4x4Kernel;
class NEGEMMLowpMatrixMultiplyKernel;
class NEGEMMLowpOffsetContributionKernel;
@@ -47,7 +46,7 @@ class NEGEMMLowpMatrixBReductionKernel;
class NEGEMMTranspose1xWKernel;
class NEGEMMAssemblyDispatch;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on Neon. This function calls the following Neon kernels if the DOT product instruction is not available:
+/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
*
* -# @ref NEGEMMInterleave4x4Kernel
* -# @ref NEGEMMTranspose1xWKernel
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 79b427ea6f..c22ed1b5c4 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-/** This file contains all available output stages for GEMMLowp on Neon.
+/** This file contains all available output stages for GEMMLowp.
*
* In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
* and processes it to obtain the final ASYMM8 value.
@@ -40,7 +40,7 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on Neon.
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint.
*
* NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
*
@@ -61,7 +61,7 @@ class ITensorInfo;
*
* ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
*
@@ -112,7 +112,7 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on Neon.
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint.
*
* NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
*
@@ -133,7 +133,7 @@ public:
*
* ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
*
@@ -184,7 +184,7 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on Neon.
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint.
*
* NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
*
@@ -205,7 +205,7 @@ public:
*
* ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
*
@@ -256,9 +256,9 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
-/** Basic function to execute GEMMLowpQuantizeDown kernels on Neon.
+/** Basic function to execute GEMMLowpQuantizeDown kernels.
*
- * This function calls the following Neon kernels:
+ * This function calls the following kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
* -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
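
To make the fixed-point formula quoted in these comments concrete, here is a scalar sketch of the uint8 quantize-down path (rounding and saturation details are simplified relative to the vectorised kernels):

#include <algorithm>
#include <cstdint>

// Rounding "doubling high multiply": the FixedPointMul of the formula above,
// with round-half-up behaviour and no INT32_MIN saturation handling.
int32_t fixed_point_mul(int32_t x, int32_t multiplier)
{
    const int64_t round = int64_t(1) << 30;
    return static_cast<int32_t>((static_cast<int64_t>(x) * multiplier + round) >> 31);
}

// ((FixedPointMul(acc + bias, multiplier)) >> result_shift) + offset, clamped to ASYMM8.
uint8_t quantize_down(int32_t acc, int32_t bias, int32_t multiplier,
                      int result_shift, int32_t offset_after_shift)
{
    int32_t v = fixed_point_mul(acc + bias, multiplier) >> result_shift;
    v += offset_after_shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, v)));
}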
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index a59dcf88cc..53a024ae04 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -47,7 +47,7 @@ class ITensor;
/** Basic function to run @ref NELSTMLayerQuantized
*
- * This function calls the following Neon functions/kernels:
+ * This function calls the following functions/kernels:
*
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
* -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
index 7b1f7e9ca1..fae26b3c93 100644
--- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -35,7 +35,7 @@ class ITensorInfo;
class NEFill;
class NEMaxUnpoolingLayerKernel;
-/** Function to perform MaxUnpooling. This function calls the following Neon kernels:
+/** Function to perform MaxUnpooling. This function calls the following kernels:
*
* -# @ref NEFill
* -# @ref NEMaxUnpoolingLayerKernel
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index fbc2f6f95b..8c4ad1516e 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -39,7 +39,7 @@ namespace arm_compute
class ITensor;
class NENormalizationLayerKernel;
-/** Basic function to compute a normalization layer. This function calls the following Neon kernels:
+/** Basic function to compute a normalization layer. This function calls the following kernels:
*
* -# @ref NEPixelWiseMultiplication
* -# @ref NEFillBorderKernel
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index 242625604f..76ff0643a0 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -38,7 +38,7 @@ namespace arm_compute
{
class NEPadLayerKernel;
-/** Basic function to pad a tensor. This function calls the following Neon functions/kernels:
+/** Basic function to pad a tensor. This function calls the following functions/kernels:
*
* - For padding mode = PaddingMode::CONSTANT:
* -# @ref NEPadLayerKernel
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index fb95e45bdb..2508458a3d 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -52,7 +52,7 @@ public:
NEPermute &operator=(const NEPermute &) = delete;
/** Default move assignment operator */
NEPermute &operator=(NEPermute &&) = default;
- /** Configure the permute Neon kernel
+ /** Configure the permute function
*
* @note Arbitrary permutation vectors are supported with rank not greater than 4
*
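
A minimal usage sketch for this function (it is assumed here, as for most functions in this API, that the output tensor info is auto-initialised during configure()):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void permute_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
    NEPermute permute;
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U)); // rank-3 permutation
    src.allocator()->allocate();
    dst.allocator()->allocate();
    permute.run();
}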
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index e374348f98..cb136ebca9 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -36,7 +36,7 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following Neon kernels:
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
*
* -# @ref NEFillBorderKernel (executed if padding size is different from zero)
* -# @ref cpu::kernels::CpuPoolingKernel
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 954aceba1a..e706179415 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -47,7 +47,7 @@ class NEGEMMLowpMatrixAReductionKernel;
/** Basic function to run @ref NEQLSTMLayer
*
- * This function calls the following Neon functions/kernels:
+ * This function calls the following kernels:
*
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
* -# @ref NEArithmeticAddition Elementwise addition
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 54ec76b177..9e2d9ecf24 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -35,12 +35,7 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
-/** Basic function to simulate a quantization layer. This function calls the following Arm(R) Neon(TM) implementation layers:
- *
- *
- * -# @ref cpu::CpuQuantization
- *
- */
+/** Basic function to run a quantization layer using @ref cpu::CpuQuantization */
class NEQuantizationLayer : public IFunction
{
public:
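
A minimal sketch of quantizing FP32 data to QASYMM8 with this function (scale and offset values are arbitrary; the output quantization is taken from the destination tensor info):

#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void quantize_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::QASYMM8,
                                     QuantizationInfo(0.05f, 10)));
    NEQuantizationLayer quantize;
    quantize.configure(&src, &dst);
    src.allocator()->allocate();
    dst.allocator()->allocate();
    quantize.run();
}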
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index 9d934588fb..c72cd494d2 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -32,12 +32,7 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
-/** Basic function to run @ref NEROIAlignLayerKernel.
- *
- * This function calls the following Neon kernels:
- * -# @ref NEROIAlignLayerKernel
- *
- */
+/** Basic function to run @ref NEROIAlignLayerKernel. */
class NEROIAlignLayer : public INESimpleFunctionNoBorder
{
public:
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 510c89caf2..214dd43402 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -35,12 +35,7 @@ class ITensorInfo;
class NEROIPoolingLayerKernel;
class ROIPoolingLayerInfo;
-/** Basic function to run @ref NEROIPoolingLayerKernel.
- *
- * This function calls the following Neon kernels:
- * -# @ref NEROIPoolingLayerKernel
- *
- */
+/** Basic function to run @ref NEROIPoolingLayerKernel. */
class NEROIPoolingLayer : public IFunction
{
public:
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index f30cc810f1..b96b70926c 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -35,7 +35,7 @@ namespace arm_compute
class ITensor;
class NEReductionOperationKernel;
-/** Basic function to simulate a reduction operation. This function calls the following Neon kernels:
+/** Basic function to simulate a reduction operation. This function calls the following kernels:
*
* -# @ref NEReshapeLayer
* -# @ref NEReductionOperationKernel
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
index 84d0f2ee92..835ebfab7e 100644
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ b/arm_compute/runtime/NEON/functions/NERemap.h
@@ -34,13 +34,10 @@ namespace arm_compute
{
class ITensor;
-/** Basic function to execute remap. This function calls the following Neon kernels:
+/** Basic function to execute remap. This function calls the following kernels:
*
* -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
* -# @ref NERemapKernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.05 release
- *
*/
class NERemap : public INESimpleFunction
{
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index aeeaefcc38..27c1ddf8e3 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -36,7 +36,7 @@ class ITensorInfo;
class NESpaceToBatchLayerKernel;
class NEFill;
-/** Basic function to spatial divide a tensor. This function calls the following Neon kernels/functions:
+/** Basic function to spatial divide a tensor. This function calls the following kernels/functions:
*
* -# @ref NEFill
* -# @ref NESpaceToBatchLayerKernel
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
index d76fc48204..73c228d8ee 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
@@ -35,10 +35,7 @@ class ITensor;
class ITensorInfo;
class NESpaceToDepthLayerKernel;
-/** This function calls the following Neon kernels/functions:
- *
- * -# @ref NESpaceToDepthLayerKernel
- */
+/** Basic function to run @ref NESpaceToDepthLayerKernel. */
class NESpaceToDepthLayer : public IFunction
{
public:
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index e41cdbd0ac..befc373646 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -42,7 +42,8 @@ namespace arm_compute
class ITensor;
class ICPPKernel;
-/** Basic function to simulate a convolution layer. This function calls the following Neon kernels:
+/** Basic function to simulate a convolution layer. This function calls the following kernels:
+ *
* -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
* -# @ref NEWinogradLayerTransformInputKernel
* -# @ref NEWinogradLayerTransformOutputKernel
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index e199ee9d6f..112254e82a 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -32,7 +32,7 @@ The Compute Library is a collection of low-level machine learning functions opti
Several builds of the library are available using various configurations:
- OS: Linux, Android, macOS or bare metal.
- Architecture: armv7a (32bit) or arm64-v8a (64bit).
- - Technology: Neon / OpenCL / Neon and OpenCL.
+ - Technology: Arm® Neon™ / OpenCL / Arm® Neon™ and OpenCL.
- Debug / Asserts / Release: Use a build with asserts enabled to debug your application and enable extra validation. Once you are sure your application works as expected you can switch to a release build of the library for maximum performance.
@section S0_1_contact Contact / Support
@@ -86,7 +86,7 @@ If there is more than one release in a month then an extra sequential number is
@subsection S2_2_changelog Changelog
v21.05 Public major release
- - Removed computer vision support from Neon backend
+ - Removed computer vision support from Arm® Neon™ backend
- Removed the following functions:
- NEAbsoluteDifference
- NEAccumulate
@@ -225,7 +225,7 @@ v21.02 Public major release
v20.11 Public major release
- Various bug fixes.
- Various optimisations.
- - Performance regressions can be noted when executing Depthwise Convolution on Neon with a depth multiplier > 1 for quantized data type.
+ - Performance regressions can be noted when executing Depthwise Convolution on Arm® Neon™ with a depth multiplier > 1 for quantized data type.
This is planned to be resolved in 21.02 release.
- Added new data type QASYMM8_SIGNED support for @ref NEROIAlignLayer.
- Added new data type S32 support for:
@@ -250,11 +250,11 @@ v20.11 Public major release
- @ref CLLogicalNot
- @ref CLLogicalAnd
- @ref CLLogicalOr
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NELogicalNot
- @ref NELogicalAnd
- @ref NELogicalOr
- - Removed padding from Neon kernels:
+ - Removed padding from Arm® Neon™ kernels:
- NEComplexPixelWiseMultiplicationKernel
- NENonMaximaSuppression3x3Kernel
- @ref NERemapKernel
@@ -404,7 +404,7 @@ v20.11 Public major release
- CLWarpAffineKernel
- CLWarpPerspective
- CLWarpPerspectiveKernel
- - Deprecated Neon kernels / functions (If a kernel is used only by the function that is being deprecated, the kernel is deprecated together):
+ - Deprecated Arm® Neon™ kernels / functions (If a kernel is used only by the function that is being deprecated, the kernel is deprecated together):
- NELocallyConnectedLayer
- NELocallyConnectedMatrixMultiplyKernel
- NEAbsoluteDifference
@@ -538,7 +538,7 @@ v20.08 Public major release
- CLScaleKernel
- New OpenCL kernels / functions:
- @ref CLMaxUnpoolingLayerKernel
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEMaxUnpoolingLayerKernel
- New graph example:
- graph_yolov3_output_detector
@@ -574,7 +574,7 @@ v20.08 Public major release
- Removed OpenCL kernels / functions:
- CLGEMMLowpQuantizeDownInt32ToUint8Scale
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
- - Removed Neon kernels / functions:
+ - Removed Arm® Neon™ kernels / functions:
- NEGEMMLowpQuantizeDownInt32ToUint8Scale
- NEGEMMMatrixAccumulateBiasesKernel
- Deprecated functions / interfaces:
@@ -589,7 +589,7 @@ v20.08 Public major release
- Removed padding requirement for the input (e.g. LHS of GEMM) and output in @ref CLGEMMMatrixMultiplyNativeKernel, @ref CLGEMMMatrixMultiplyReshapedKernel, @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel and @ref CLIm2ColKernel (NHWC only)
- This change allows to use @ref CLGEMMConvolutionLayer without extra padding for the input and output.
- Only the weights/bias of @ref CLGEMMConvolutionLayer could require padding for the computation.
- - Only on Arm Mali Midgard GPUs, @ref CLGEMMConvolutionLayer could require padding since @ref CLGEMMMatrixMultiplyKernel is called and currently requires padding.
+ - Only on Arm® Mali™ Midgard GPUs, @ref CLGEMMConvolutionLayer could require padding since @ref CLGEMMMatrixMultiplyKernel is called and currently requires padding.
- Added support for exporting the OpenCL buffer object to the OpenCL image object in @ref CLGEMMMatrixMultiplyReshapedKernel and @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
- This support allows to export the OpenCL buffer used for the reshaped RHS matrix to the OpenCL image object.
- The padding requirement for the OpenCL image object is considered into the @ref CLGEMMReshapeRHSMatrixKernel.
@@ -640,7 +640,7 @@ v20.05 Public major release
- New OpenCL kernels / functions:
- @ref CLQLSTMLayer
- @ref CLQLSTMLayerNormalizationKernel
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEQLSTMLayer
- @ref NEQLSTMLayerNormalizationKernel
- Added HARD_SWISH support in:
@@ -649,20 +649,20 @@ v20.05 Public major release
- Deprecated OpenCL kernels / functions:
- CLGEMMLowpQuantizeDownInt32ToUint8Scale
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
- - Deprecated Neon kernels / functions:
+ - Deprecated Arm® Neon™ kernels / functions:
- NEGEMMLowpQuantizeDownInt32ToUint8Scale
- Removed CPP kernels / functions:
- CPPFlipWeightsKernel
- Removed PoolingLayerInfo constructors without Data Layout.
- Removed CLDepthwiseConvolutionLayer3x3
- Removed NEDepthwiseConvolutionLayerOptimized
- - Added support for Winograd 3x3,4x4 on Neon FP16:
+ - Added support for Winograd 3x3,4x4 on Arm® Neon™ FP16:
- @ref NEWinogradConvolutionLayer
- @ref NEWinogradLayerTransformInputKernel
- @ref NEWinogradLayerTransformOutputKernel
- @ref NEWinogradLayerTransformWeightsKernel
- Added CLCompileContext
- - Added Neon GEMM kernel with 2D window support
+ - Added Arm® Neon™ GEMM kernel with 2D window support
v20.02.1 Maintenance release
- Added Android-NN build script.
@@ -700,14 +700,14 @@ v20.02 Public major release
- New OpenCL kernels / functions:
- @ref CLFill
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEFill
- @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- - Deprecated Neon functions / interfaces:
+ - Deprecated Arm® Neon™ functions / interfaces:
- CLDepthwiseConvolutionLayer3x3
- NEDepthwiseConvolutionLayerOptimized
- PoolingLayerInfo constructors without Data Layout.
- - Added support for quantization with multiplier greater than 1 on Neon and CL.
+ - Added support for quantization with multiplier greater than 1 on Arm® Neon™ and CL.
- Added support for quantized inputs of type QASYMM8_SIGNED and QASYMM8 to @ref CLQuantizationLayer.
- Added the ability to build bootcode for bare metal.
- Added support for generating synthetic QASYMM8 graphs.
@@ -732,7 +732,7 @@ v19.11 Public major release
- CLDepthwiseSeparableConvolutionLayer
- CLDepthwiseVectorToTensorKernel
- CLDirectConvolutionLayerOutputStageKernel
- - Deprecated Neon kernels / functions:
+ - Deprecated Arm® Neon™ kernels / functions:
- NEDepthwiseWeightsReshapeKernel
- NEDepthwiseIm2ColKernel
- NEDepthwiseSeparableConvolutionLayer
@@ -743,7 +743,7 @@ v19.11 Public major release
- @ref CLDepthwiseConvolutionLayerNativeKernel to replace the old generic depthwise convolution (see Deprecated
OpenCL kernels / functions)
- @ref CLLogSoftmaxLayer
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEBoundingBoxTransformKernel / @ref NEBoundingBoxTransform
- @ref NEComputeAllAnchorsKernel / NEComputeAllAnchors
- @ref NEDetectionPostProcessLayer
@@ -782,8 +782,8 @@ v19.11 Public major release
- Replaced the calls to CLCopyKernel and CLMemsetKernel with @ref CLPadLayer in @ref CLGenerateProposalsLayer.
- Improved performance for CL Inception V3 - FP16.
- Improved accuracy for CL Inception V3 - FP16 by enabling FP32 accumulator (mixed-precision).
- - Improved Neon performance by enabling fusing batch normalization with convolution and depth-wise convolution layer.
- - Improved Neon performance for MobileNet-SSD by improving the output detection performance.
+ - Improved Arm® Neon™ performance by enabling fusing batch normalization with convolution and depth-wise convolution layer.
+ - Improved Arm® Neon™ performance for MobileNet-SSD by improving the output detection performance.
- Optimized @ref CLPadLayer.
- Optimized CL generic depthwise convolution layer by introducing @ref CLDepthwiseConvolutionLayerNativeKernel.
- Reduced memory consumption by implementing weights sharing.
@@ -799,7 +799,7 @@ v19.08.1 Public maintenance release
v19.08 Public major release
- Various bug fixes.
- Various optimisations.
- - Deprecated Neon functions
+ - Deprecated Arm® Neon™ functions
- NEDepthConcatenateLayer
- NEWidthConcatenateLayer
- Deprecated OpenCL kernels / functions
@@ -807,7 +807,7 @@ v19.08 Public major release
- CLGEMMInterleave4x4Kernel / CLGEMMInterleave4x4
- CLGEMMTranspose1xWKernel / CLGEMMTranspose1xW
- CLWidthConcatenateLayer
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEAbsLayer
- @ref NECast
- @ref NEElementwisePower
@@ -846,7 +846,7 @@ v19.08 Public major release
- Added support for REDUCE_MIN and REDUCE_MAX in @ref ReductionOperation
- Enable the fusion of batch normalization with convolution and depthwise convolution layer for FP32 in the graph API (OpenCL only)
- Added support for fusing activation function and broadcast addition with the matrix multiplication for FP32 (OpenCL only)
- - Re-factored the depthwise convolution layer kernel on Neon for generic cases
+ - Re-factored the depthwise convolution layer kernel on Arm® Neon™ for generic cases
- Added an optimized depthwise convolution layer kernel for 5x5 filters (Neon only)
- Added support to enable OpenCL kernel cache. Added example showing how to load the prebuilt OpenCL kernels from a binary cache file
- Altered @ref QuantizationInfo interface to support per-channel quantization.
@@ -854,12 +854,12 @@ v19.08 Public major release
- The NEDepthwiseConvolutionLayerOptimized will be included by @ref NEDepthwiseConvolutionLayer to accommodate for future optimizations.
- Removed inner_border_right and inner_border_top parameters from @ref CLDeconvolutionLayer interface
- Removed inner_border_right and inner_border_top parameters from @ref NEDeconvolutionLayer interface
- - Optimized the Neon assembly kernel for GEMMLowp. The new implementation fuses the output stage and quantization with the matrix multiplication kernel
+ - Optimized the Arm® Neon™ assembly kernel for GEMMLowp. The new implementation fuses the output stage and quantization with the matrix multiplication kernel
v19.05 Public major release
- Various bug fixes.
- Various optimisations.
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEBatchToSpaceLayerKernel / @ref NEBatchToSpaceLayer
- NEComplexPixelWiseMultiplicationKernel / @ref NEComplexPixelWiseMultiplication
- @ref NECropKernel / @ref NECropResize
@@ -927,7 +927,7 @@ v19.05 Public major release
v19.02 Public major release
- Various bug fixes.
- Various optimisations.
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NETileKernel / @ref NETile
- @ref NEFuseBatchNormalizationKernel / @ref NEFuseBatchNormalization
- NEElementwiseOperationKernel
@@ -1010,7 +1010,7 @@ v19.02 Public major release
v18.11 Public major release
- Various bug fixes.
- Various optimisations.
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEChannelShuffleLayer / @ref NEChannelShuffleLayerKernel
- @ref NEReduceMean
- @ref NEReorgLayer / @ref NEReorgLayerKernel
@@ -1084,7 +1084,7 @@ v18.08 Public major release
- @ref CLConvolutionLayer
- @ref CLScale
- @ref CLIm2ColKernel
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NERNNLayer
- New OpenCL kernels / functions:
- @ref CLArithmeticDivision
@@ -1123,7 +1123,7 @@ v18.05 Public major release
- CLWidthConcatenateLayer / CLWidthConcatenateLayerKernel
- @ref CLWinogradFilterTransformKernel / @ref CLWinogradInputTransformKernel / @ref CLWinogradConvolutionLayer
- @ref CLWinogradInputTransformKernel / @ref CLWinogradInputTransform
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEConvertFullyConnectedWeightsKernel / @ref NEConvertFullyConnectedWeights.
- Created the validate method in @ref CLDepthwiseConvolutionLayer.
- Beta and gamma are no longer mandatory arguments in @ref NEBatchNormalizationLayer and @ref CLBatchNormalizationLayer.
@@ -1161,7 +1161,7 @@ v18.03 Public maintenance release
- Renamed NEWinogradLayer.cpp to @ref NEWinogradConvolutionLayer
v18.02 Public major release
- - Various Neon / OpenCL / GLES optimisations.
+ - Various Arm® Neon™ / OpenCL / GLES optimisations.
- Various bug fixes.
- Changed default number of threads on big LITTLE systems.
- Refactored examples and added:
@@ -1186,7 +1186,7 @@ v18.02 Public major release
- Added support for non-square pooling to @ref NEPoolingLayer and @ref CLPoolingLayer
- New OpenCL kernels / functions:
- CLDirectConvolutionLayerOutputStageKernel
- - New Neon kernels / functions
+ - New Arm® Neon™ kernels / functions
- Added name() method to all kernels.
- Added support for Winograd 5x5.
- NEPermuteKernel / @ref NEPermute
@@ -1213,9 +1213,9 @@ v18.01 Public maintenance release
- GCGEMMInterleave4x4Kernel
- GCGEMMTranspose1xWKernel
- GCIm2ColKernel
- - Refactored Neon Winograd (NEWinogradLayerKernel)
+ - Refactored Arm® Neon™ Winograd (NEWinogradLayerKernel)
- Added @ref NEDirectConvolutionLayerOutputStageKernel
- - Added QASYMM8 support to the following Neon kernels:
+ - Added QASYMM8 support to the following Arm® Neon™ kernels:
- NEDepthwiseConvolutionLayer3x3Kernel
- @ref NEFillBorderKernel
- NEPoolingLayerKernel
@@ -1230,7 +1230,7 @@ v17.12 Public major release
- Introduced logging interface
- Introduced opencl timer
- Reworked GEMMLowp interface
- - Added new Neon assembly kernels for GEMMLowp, SGEMM and HGEMM
+ - Added new Arm® Neon™ assembly kernels for GEMMLowp, SGEMM and HGEMM
- Added validation method for most Machine Learning kernels / functions
- Added new graph examples such as googlenet, mobilenet, squeezenet, vgg16 and vgg19
- Added sgemm example for OpenCL
@@ -1257,7 +1257,7 @@ v17.12 Public major release
- GCLogits1DMaxKernel / GCLogits1DShiftExpSumKernel / GCLogits1DNormKernel / GCSoftmaxLayer
- GCTransposeKernel / GCTranspose
- - New Neon kernels / functions
+ - New Arm® Neon™ kernels / functions
- arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
- arm_compute::NEHGEMMAArch64FP16Kernel
- NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
@@ -1269,7 +1269,7 @@ v17.12 Public major release
- @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- - New graph nodes for Neon and OpenCL
+ - New graph nodes for Arm® Neon™ and OpenCL
- graph::BranchLayer
- graph::DepthConvertLayer
- graph::DepthwiseConvolutionLayer
@@ -1293,8 +1293,8 @@ v17.09 Public major release
- Experimental Graph support: initial implementation of a simple stream API to easily chain machine learning layers.
- Memory Manager (@ref BlobLifetimeManager, @ref BlobMemoryPool, @ref ILifetimeManager, @ref IMemoryGroup, @ref IMemoryManager, @ref IMemoryPool, @ref IPoolManager, @ref MemoryManagerOnDemand, @ref PoolManager)
- New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework).
- - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both Neon and OpenCL.
- - New Neon kernels / functions:
+ - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both Arm® Neon™ and OpenCL.
+ - New Arm® Neon™ kernels / functions:
- arm_compute::NEGEMMAssemblyBaseKernel arm_compute::NEGEMMAArch64Kernel
- NEDequantizationLayerKernel / @ref NEDequantizationLayer
- NEFloorKernel / @ref NEFloor
@@ -1320,12 +1320,12 @@ v17.09 Public major release
v17.06 Public major release
- Various bug fixes
- - Added support for fixed point 8 bit (QS8) to the various Neon machine learning kernels.
+ - Added support for fixed point 8 bit (QS8) to the various Arm® Neon™ machine learning kernels.
- Added unit tests and benchmarks (AlexNet, LeNet)
- Added support for sub tensors.
- Added infrastructure to provide GPU specific optimisation for some OpenCL kernels.
- Added @ref OMPScheduler (OpenMP) scheduler for Neon
- - Added @ref SingleThreadScheduler scheduler for Neon (For bare metal)
+ - Added @ref SingleThreadScheduler scheduler for Arm® Neon™ (For bare metal)
- User can specify his own scheduler by implementing the @ref IScheduler interface.
- New OpenCL kernels / functions:
- @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
@@ -1335,7 +1335,7 @@ v17.06 Public major release
- @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights
- New C++ kernels:
- CPPDetectionWindowNonMaximaSuppressionKernel
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NEBatchNormalizationLayerKernel / @ref NEBatchNormalizationLayer
- NEDepthConcatenateLayerKernel / NEDepthConcatenateLayer
- @ref NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
@@ -1373,11 +1373,11 @@ v17.04 Public bug fixes release
v17.03.1 First Major public release of the sources
- Renamed the library to arm_compute
- - New CPP target introduced for C++ kernels shared between Neon and CL functions.
+ - New CPP target introduced for C++ kernels shared between Arm® Neon™ and CL functions.
- New padding calculation interface introduced and ported most kernels / functions to use it.
- New OpenCL kernels / functions:
- CLGEMMLowpMatrixMultiplyKernel / CLGEMMLowp
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- @ref NENormalizationLayerKernel / @ref NENormalizationLayer
- NETransposeKernel / @ref NETranspose
- NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
@@ -1394,7 +1394,7 @@ v17.03 Sources preview
- CLLKTrackerInitKernel, CLLKTrackerStage0Kernel, CLLKTrackerStage1Kernel, CLLKTrackerFinalizeKernel / CLOpticalFlow
- @ref CLNormalizationLayerKernel / @ref CLNormalizationLayer
- CLLaplacianPyramid, CLLaplacianReconstruct
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- NEActivationLayerKernel / @ref NEActivationLayer
- GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
- NEPoolingLayerKernel / @ref NEPoolingLayer
@@ -1408,7 +1408,7 @@ v17.02.1 Sources preview
- CLGaussianPyramidHorKernel, CLGaussianPyramidVertKernel / CLGaussianPyramid, CLGaussianPyramidHalf, CLGaussianPyramidOrb
- CLMinMaxKernel, CLMinMaxLocationKernel / CLMinMaxLocation
- CLNonLinearFilterKernel / CLNonLinearFilter
- - New Neon FP16 kernels (Requires armv8.2 CPU)
+ - New Arm® Neon™ FP16 kernels (Requires armv8.2 CPU)
- NEAccumulateWeightedFP16Kernel
- NEBox3x3FP16Kernel
- NENonMaximaSuppression3x3FP16Kernel
@@ -1420,7 +1420,7 @@ v17.02 Sources preview
- CLDerivativeKernel / CLChannelExtract
- CLFastCornersKernel / CLFastCorners
- CLMeanStdDevKernel / CLMeanStdDev
- - New Neon kernels / functions:
+ - New Arm® Neon™ kernels / functions:
- HOG / SVM: NEHOGOrientationBinningKernel, NEHOGBlockNormalizationKernel, NEHOGDetectorKernel, NEHOGNonMaximaSuppressionKernel / NEHOGDescriptor, NEHOGDetector, NEHOGGradient, NEHOGMultiDetection
- NENonLinearFilterKernel / NENonLinearFilter
- Introduced a CLScheduler to manage the default context and command queue used by the runtime library and create synchronisation events.
@@ -1473,7 +1473,7 @@ To see the build options available simply run ```scons -h```:
opencl: Enable OpenCL support (yes|no)
default: True
- neon: Enable Neon support (yes|no)
+ neon: Enable Arm® Neon™ support (yes|no)
default: False
embed_kernels: Embed OpenCL kernels in library binary (yes|no)
@@ -1555,7 +1555,7 @@ To see the build options available simply run ```scons -h```:
pmu: Enable PMU counters (yes|no)
default: False
- mali: Enable Mali hardware counters (yes|no)
+ mali: Enable Arm® Mali™ hardware counters (yes|no)
default: False
external_tests_dir: Add examples, benchmarks and tests to the tests suite from an external path ( /path/to/external_tests_dir )
@@ -1569,7 +1569,7 @@ To see the build options available simply run ```scons -h```:
@b arch: The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.
@b os: Choose the operating system you are targeting: Linux, Android or bare metal.
-@note bare metal can only be used for Neon (not OpenCL), only static libraries get built and Neon's multi-threading support is disabled.
+@note bare metal can only be used for Arm® Neon™ (not OpenCL); only static libraries get built, and Arm® Neon™ multi-threading support is disabled.
@b build: you can either build directly on your device (native) or cross compile from your desktop machine (cross-compile). In both cases make sure the compiler is available in your path.
@@ -1581,7 +1581,7 @@ In addition the option 'compress_kernels' will compress the embedded OpenCL ker
@b Werror: If you are compiling using the same toolchains as the ones used in this guide then there shouldn't be any warning and therefore you should be able to keep Werror=1. If with a different compiler version the library fails to build because of warnings interpreted as errors then, if you are sure the warnings are not important, you might want to try to build with Werror=0 (But please do report the issue on Github).
-@b opencl / @b neon: Choose which SIMD technology you want to target. (Neon for Arm Cortex-A CPUs or OpenCL for Arm Mali GPUs)
+@b opencl / @b neon: Choose which SIMD technology you want to target. (Arm® Neon™ for Arm® Cortex®-A CPUs or OpenCL for Arm® Mali™ GPUs)
@b embed_kernels: For OpenCL only: set embed_kernels=1 if you want the OpenCL kernels to be built in the library's binaries instead of being read from separate ".cl" / ".cs" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL kernel files by calling CLKernelLibrary::init(). By default the path is set to "./cl_kernels".
@@ -1607,7 +1607,7 @@ Example:
@b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)
-@b mali: Enable the collection of Mali hardware counters to measure execution time in benchmark tests. (Your device needs to have a Mali driver that supports it)
+@b mali: Enable the collection of Arm® Mali™ hardware counters to measure execution time in benchmark tests. (Your device needs to have an Arm® Mali™ driver that supports it)
@b openmp: Build in the OpenMP scheduler for Neon.
@@ -1645,7 +1645,7 @@ For Linux, the library was successfully built and tested using the following Lin
- gcc-linaro-6.3.1-2017.05-x86_64_arm-linux-gnueabihf
- gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu
-To cross-compile the library in debug mode, with Neon only support, for Linux 32bit:
+To cross-compile the library in debug mode, with Arm® Neon™ only support, for Linux 32bit:
scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=linux arch=armv7a
@@ -1678,11 +1678,11 @@ The examples get automatically built by scons as part of the build process of th
@note The following command lines assume the arm_compute libraries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built libraries with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
-To cross compile a Neon example for Linux 32bit:
+To cross compile an Arm® Neon™ example for Linux 32bit:
arm-linux-gnueabihf-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_convolution
-To cross compile a Neon example for Linux 64bit:
+To cross compile an Arm® Neon™ example for Linux 64bit:
aarch64-linux-gnu-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_convolution
@@ -1712,11 +1712,11 @@ i.e. to cross compile the "graph_lenet" example for Linux 64bit:
@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
-To compile natively (i.e directly on an Arm device) for Neon for Linux 32bit:
+To compile natively (i.e. directly on an Arm device) for Arm® Neon™ for Linux 32bit:
g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_convolution
-To compile natively (i.e directly on an Arm device) for Neon for Linux 64bit:
+To compile natively (i.e. directly on an Arm device) for Arm® Neon™ for Linux 64bit:
g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_convolution
@@ -1798,7 +1798,7 @@ For NDK r19 or newer, you can directly <a href="https://developer.android.com/nd
@subsubsection S3_3_1_library How to build the library ?
-To cross-compile the library in debug mode, with Neon only support, for Android 32bit:
+To cross-compile the library in debug mode, with Arm® Neon™ only support, for Android 32bit:
CXX=clang++ CC=clang scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=android arch=armv7a
@@ -1814,7 +1814,7 @@ The examples get automatically built by scons as part of the build process of th
Once you've got your Android standalone toolchain built and added to your path you can do the following:
-To cross compile a Neon example:
+To cross compile an Arm® Neon™ example:
#32 bit:
arm-linux-androideabi-clang++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_convolution_arm -static-libstdc++ -pie
@@ -1835,7 +1835,7 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n
#64 bit:
aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
-@note Due to some issues in older versions of the Mali OpenCL DDK (<= r13p0), we recommend to link arm_compute statically on Android.
+@note Due to some issues in older versions of the Arm® Mali™ OpenCL DDK (<= r13p0), we recommend linking arm_compute statically on Android.
@note When linked statically, the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly
Then all you need to do is upload the executable and the shared library to the device using ADB:
@@ -1893,7 +1893,7 @@ Download linaro for <a href="https://releases.linaro.org/components/toolchain/bi
@subsubsection S3_5_1_library How to build the library ?
-To cross-compile the library with Neon support for baremetal arm64-v8a:
+To cross-compile the library with Arm® Neon™ support for baremetal arm64-v8a:
scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=bare_metal arch=arm64-v8a build=cross_compile cppthreads=0 openmp=0 standalone=1
@@ -1933,13 +1933,13 @@ can be followed.
@subsubsection S3_7_1_cl_hard_requirements Hard Requirements
-Compute Library requires OpenCL 1.1 and above with support of non uniform workgroup sizes, which is officially supported in the Mali OpenCL DDK r8p0 and above as an extension (respective extension flag is \a -cl-arm-non-uniform-work-group-size).
+Compute Library requires OpenCL 1.1 and above with support for non-uniform work-group sizes, which is officially supported in the Arm® Mali™ OpenCL DDK r8p0 and above as an extension (the respective extension flag is \a -cl-arm-non-uniform-work-group-size).
-Enabling 16-bit floating point calculations require \a cl_khr_fp16 extension to be supported. All Mali GPUs with compute capabilities have native support for half precision floating points.
+Enabling 16-bit floating point calculations requires the \a cl_khr_fp16 extension to be supported. All Arm® Mali™ GPUs with compute capabilities have native support for half-precision floating point.
@subsubsection S3_7_2_cl_performance_requirements Performance improvements
-Integer dot product built-in function extensions (and therefore optimized kernels) are available with Mali OpenCL DDK r22p0 and above for the following GPUs : G71, G76. The relevant extensions are \a cl_arm_integer_dot_product_int8, \a cl_arm_integer_dot_product_accumulate_int8 and \a cl_arm_integer_dot_product_accumulate_int16.
+Integer dot product built-in function extensions (and therefore optimized kernels) are available with Arm® Mali™ OpenCL DDK r22p0 and above for the following GPUs: G71 and G76. The relevant extensions are \a cl_arm_integer_dot_product_int8, \a cl_arm_integer_dot_product_accumulate_int8 and \a cl_arm_integer_dot_product_accumulate_int16.
OpenCL kernel level debugging can be simplified with the use of printf; this requires the \a cl_arm_printf extension to be supported.
diff --git a/docs/01_library.dox b/docs/01_library.dox
index 5cd33b67a6..6f4b717bfa 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox
@@ -38,10 +38,10 @@ The Core library is a low level collection of algorithms implementations, it is
The Runtime library is a very basic wrapper around the Core library which can be used for quick prototyping, it is basic in the sense that:
- It allocates images and tensors by using standard malloc().
-- It multi-threads Neon code in a very basic way using a very simple pool of threads.
+- It multi-threads Arm® Neon™ code in a very basic way using a very simple pool of threads.
- For OpenCL it uses the default CLScheduler command queue for all mapping operations and kernels.
-For maximum performance, it is expected that the users would re-implement an equivalent to the runtime library which suits better their needs (With a more clever multi-threading strategy, load-balancing between Neon and OpenCL, etc.)
+For maximum performance, users are expected to re-implement an equivalent of the runtime library that better suits their needs (with a more clever multi-threading strategy, load-balancing between Arm® Neon™ and OpenCL, etc.)
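As a rough illustration of the prototyping flow this enables, consider the following minimal sketch; the tensor shape, data type and activation function are arbitrary illustration values, not taken from the patch:

@code{.cpp}
// Minimal sketch of the runtime prototyping flow described above.
Tensor input{}, output{};
input.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
output.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

NEActivationLayer act;
act.configure(&input, &output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

input.allocator()->allocate();  // Backing memory comes from standard malloc()
output.allocator()->allocate();

act.run(); // Multi-threaded on the CPU by the default scheduler
@endcode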
@section S4_1_2 Data-type and Data-layout support
@@ -62,7 +62,7 @@ where N = batches, C = channels, H = height, W = width
@section S4_1_3 Fast-math support
Compute Library supports different types of convolution methods; the fast-math flag is only used for the Winograd algorithm.
-When the fast-math flag is enabled, both Neon and CL convolution layers will try to dispatch the fastest implementation available, which may introduce a drop in accuracy as well. The different scenarios involving the fast-math flag are presented below:
+When the fast-math flag is enabled, both Arm® Neon™ and CL convolution layers will try to dispatch the fastest implementation available, which may also introduce a drop in accuracy. The different scenarios involving the fast-math flag are presented below:
- For FP32:
- no-fast-math: Only supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7
- fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7
@@ -131,7 +131,7 @@ kernel.run( max_window ); // Run the kernel on the full window
@subsection S4_2_3 Multi-threading
-The previous section shows how to run a Neon / CPP kernel in the current thread, however if your system has several CPU cores, you will probably want the kernel to use several cores. Here is how this can be done:
+The previous section shows how to run an Arm® Neon™ / CPP kernel in the current thread; however, if your system has several CPU cores, you will probably want the kernel to use several cores. Here is how this can be done:
@code{.cpp}
ThreadInfo info;
@@ -181,7 +181,7 @@ The previous section shows how to run a Neon / CPP kernel in the current thread,
}
@endcode
-This is a very basic implementation which was originally used in the Neon runtime library by all the Neon functions.
+This is a very basic implementation which was originally used in the Arm® Neon™ runtime library by all the Arm® Neon™ functions.
@sa CPPScheduler
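In practice, a configured kernel is usually handed straight to the scheduler, which performs the window splitting shown above internally. A minimal sketch (the kernel's configure call is elided, and the split dimension is an arbitrary illustration):

@code{.cpp}
// Dispatch an already-configured kernel across the CPU cores,
// splitting its execution window along the Y dimension.
NEFillBorderKernel kernel;
// ... kernel.configure(...) elided ...
NEScheduler::get().schedule(&kernel, Window::DimY);
@endcode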
@@ -202,7 +202,7 @@ function.configure( input, output, option0, option1);
function.run();
@endcode
-@warning The Compute Library requires Mali OpenCL DDK r8p0 or higher (OpenCL kernels are compiled using the -cl-arm-non-uniform-work-group-size flag)
+@warning The Compute Library requires Arm® Mali™ OpenCL DDK r8p0 or higher (OpenCL kernels are compiled using the -cl-arm-non-uniform-work-group-size flag)
@note All OpenCL functions and objects in the runtime library use the command queue associated with CLScheduler for all operations; a real implementation would be expected to use different queues for mapping operations and kernels in order to reach a better GPU utilization.
@@ -225,9 +225,9 @@ If the library is compiled with embed_kernels=0 the application can set the path
In order to block until all the jobs in the CLScheduler's command queue are done executing the user can call @ref CLScheduler::sync() or create a sync event using @ref CLScheduler::enqueue_sync_event()
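For instance, a hedged sketch of draining the queue before reading results on the host (cl_func0 and cl_func1 are placeholder names for any configured OpenCL functions):

@code{.cpp}
cl_func0.run();             // Enqueues kernels on the CLScheduler queue
cl_func1.run();
CLScheduler::get().sync();  // Blocks until the command queue has drained
@endcode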
-@subsection S4_4_2_cl_neon OpenCL / Neon interoperability
+@subsection S4_4_2_cl_neon OpenCL / Arm® Neon™ interoperability
-You can mix OpenCL and Neon kernels and functions. However it is the user's responsibility to handle the mapping/unmapping of OpenCL objects.
+You can mix OpenCL and Arm® Neon™ kernels and functions. However, it is the user's responsibility to handle the mapping/unmapping of OpenCL objects.
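A minimal sketch of that responsibility (cl_function, neon_function and cl_tensor are placeholder names for already-configured objects):

@code{.cpp}
cl_function.run();   // Writes to cl_tensor on the GPU
cl_tensor.map();     // Map the OpenCL buffer so the CPU can access it
neon_function.run(); // A CPU function reads cl_tensor through ITensor
cl_tensor.unmap();   // Hand the buffer back to OpenCL
@endcode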
@section S4_5_algorithms Algorithms
@@ -249,7 +249,7 @@ You have 3 types of @ref BorderMode :
- @ref BorderMode::REPLICATE : Neighbor pixels outside of the image are treated as having the same value as the closest valid pixel.
- @ref BorderMode::CONSTANT : Neighbor pixels outside of the image are treated as having the same constant value. (The user can choose what this value should be).
-Moreover both OpenCL and Neon use vector loads and stores instructions to access the data in buffers, so in order to avoid having special cases to handle for the borders all the images and tensors used in this library must be padded.
+Moreover, both OpenCL and Arm® Neon™ use vector load and store instructions to access the data in buffers, so in order to avoid special cases to handle the borders, all the images and tensors used in this library must be padded.
@subsubsection padding Padding
@@ -474,7 +474,7 @@ conv2.run();
The implemented @ref TensorAllocator and @ref CLTensorAllocator objects provide an interface capable of importing existing memory to a tensor as backing memory.
-A simple Neon example can be the following:
+A simple Arm® Neon™ example can be the following:
@code{.cpp}
// External backing memory
void* external_ptr = ...;
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
index 0aee8e59d8..70d2f3d67b 100644
--- a/docs/02_tests.dox
+++ b/docs/02_tests.dox
@@ -353,7 +353,7 @@ You can use the `--instruments` option to select one or more instruments to meas
`PMU` will try to read the CPU PMU events from the kernel (They need to be enabled on your platform)
-`MALI` will try to collect Mali hardware performance counters. (You need to have a recent enough Mali driver)
+`MALI` will try to collect Arm® Mali™ hardware performance counters. (You need to have a recent enough Arm® Mali™ driver)
`WALL_CLOCK_TIMER` will measure time using `gettimeofday`: this should work on all platforms.
@@ -371,7 +371,7 @@ To run the OpenCL precommit validation tests:
LD_LIBRARY_PATH=. ./arm_compute_validation --mode=precommit --filter="^CL.*"
-To run the Neon precommit benchmark tests with PMU and Wall Clock timer in miliseconds instruments enabled:
+To run the Arm® Neon™ precommit benchmark tests with PMU and Wall Clock timer in milliseconds instruments enabled:
LD_LIBRARY_PATH=. ./arm_compute_benchmark --mode=precommit --filter="^NEON.*" --instruments="pmu,wall_clock_timer_ms" --iterations=10
diff --git a/docs/04_adding_operator.dox b/docs/04_adding_operator.dox
index 1b4b575964..aef1bb4af0 100644
--- a/docs/04_adding_operator.dox
+++ b/docs/04_adding_operator.dox
@@ -71,7 +71,7 @@ Similarly, all common functions that process shapes, like calculating output sha
@subsection S4_1_2_add_kernel Add a kernel
-As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like Neon or OpenCL. All kernels in Compute Library must implement a common interface IKernel or one of the specific subinterfaces.
+As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm, partially or fully, in a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like Arm® Neon™ or OpenCL. All kernels in Compute Library must implement a common interface IKernel or one of the specific subinterfaces.
IKernel is the common interface for all the kernels in the core library; it contains the main methods to configure and run the kernel itself, such as window(), which returns the maximum window the kernel can be executed on, or is_parallelisable(), which indicates whether or not the kernel is parallelizable. If the kernel is parallelizable, then the window returned by the window() method can be split into sub-windows which can then be run in parallel; otherwise, only the window returned by window() can be passed to the run method.
There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
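To make the contract concrete, a hypothetical CPU kernel skeleton could look like the following (MyKernel and its members are illustrative only, not part of the library):

@code{.cpp}
class MyKernel : public INEKernel
{
public:
    const char *name() const override { return "MyKernel"; }
    void configure(const ITensor *input, ITensor *output)
    {
        _input  = input;
        _output = output;
        // Store the maximum window the kernel can be executed on
        INEKernel::configure(calculate_max_window(*input->info()));
    }
    void run(const Window &window, const ThreadInfo &info) override
    {
        // Process the (sub-)window assigned to this thread
    }

private:
    const ITensor *_input{ nullptr };
    ITensor       *_output{ nullptr };
};
@endcode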
@@ -120,10 +120,10 @@ For OpenCL:
@snippet src/core/gpu/cl/kernels/ClReshapeKernel.cpp ClReshapeKernel Kernel
The run will call the function defined in the .cl file.
-For the Neon backend case:
+For the Arm® Neon™ backend case:
@snippet src/core/cpu/kernels/CpuReshapeKernel.cpp NEReshapeLayerKernel Kernel
-In the Neon case, there is no need to add an extra file and we implement the kernel in the same NEReshapeLayerKernel.cpp file.
+In the Arm® Neon™ case, there is no need to add an extra file and we implement the kernel in the same NEReshapeLayerKernel.cpp file.
If the tests are already in place, the new kernel can be tested using the existing tests by adding the configure and run of the kernel to the compute_target() in the fixture.
@@ -137,13 +137,13 @@ If the tests are already in place, the new kernel can be tested using the existi
- (sub[n].start() - max[n].start()) % max[n].step() == 0
- (sub[n].end() - sub[n].start()) % max[n].step() == 0
-@ref CPPScheduler::schedule provides a sample implementation that is used for Neon kernels.
-%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in Compute Library allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups vary depending on whether Neon or CL is used. The memory group class uses memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and a IPoolManager to manage the memory pools registered with the memory manager.
+@ref CPPScheduler::schedule provides a sample implementation that is used for Arm® Neon™ kernels.
+%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in Compute Library allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups varies depending on whether Arm® Neon™ or CL is used. The memory group class uses a memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and an IPoolManager to manage the memory pools registered with the memory manager.
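A compressed sketch of that lifecycle (mm stands for an assumed, pre-created IMemoryManager; the tensors and function are placeholders):

@code{.cpp}
MemoryGroup memory_group(mm);     // Group tensors whose memory is pooled
memory_group.manage(&tensor_a);   // Register tensors before allocation
memory_group.manage(&tensor_b);
tensor_a.allocator()->allocate();
tensor_b.allocator()->allocate();

{
    MemoryGroupResourceScope scope_mg(memory_group); // Acquire pool memory
    function.run();                                  // Tensors are backed here
}                                                    // Released on scope exit
@endcode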
We have seen the various interfaces for a kernel in the core library; the same file structure design exists in the runtime module. IFunction is the base class for all the functions; it has two child interfaces, ICLSimpleFunction and INESimpleFunction, that are used as base classes for functions which call a single kernel.
-The new operator has to implement %validate(), configure() and run(), these methods will call the respective function in the kernel considering that the multi-threading is used for the kernels which are parallelizable, by default std::thread::hardware_concurrency() threads are used. For Neon function can be used CPPScheduler::set_num_threads() to manually set the number of threads, whereas for OpenCL kernels all the kernels are enqueued on the queue associated with CLScheduler and the queue is then flushed.
+The new operator has to implement %validate(), configure() and run(); these methods will call the respective function in the kernel. Multi-threading is used for the kernels which are parallelizable; by default, std::thread::hardware_concurrency() threads are used. For Arm® Neon™ functions, CPPScheduler::set_num_threads() can be used to manually set the number of threads, whereas for OpenCL all the kernels are enqueued on the queue associated with CLScheduler and the queue is then flushed.
For the runtime functions, there is an extra method implemented: prepare(). This method prepares the function for the run; it does all the heavy operations that are done only once (reshape the weights, release the memory that is no longer necessary after the reshape, etc.). The prepare method can be called standalone or during the first run, if it has not been called before; after that, the function is marked as prepared.
The files we add are:
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index 0c5145cdc8..2cd16d0603 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox
@@ -29,7 +29,7 @@ namespace arm_compute
@tableofcontents
-@section S6_1 Neon functions
+@section S6_1 Arm® Neon™ functions
- @ref IFunction
- @ref INESimpleFunction
diff --git a/docs/07_errata.dox b/docs/07_errata.dox
index 6a82ca91c4..0c8d684017 100644
--- a/docs/07_errata.dox
+++ b/docs/07_errata.dox
@@ -53,10 +53,10 @@ namespace arm_compute
- Versions Affected: >= v19.11
- OSs Affected: Linux
- Conditions:
- - Mali DDK r1p0 - r8p0, and
+ - Arm® Mali™ DDK r1p0 - r8p0, and
- Linux kernel >= 4.4
-- On Android with arm64-v8a/arm64-v8.2-a architecture, Neon validation tests can fail when compiled using Android Ndk
+- On Android with arm64-v8a/arm64-v8.2-a architecture, Arm® Neon™ validation tests can fail when compiled using Android NDK
>= r18b in debug mode (https://github.com/android/ndk/issues/1135).
- Versions Affected: >= v19.11
- OSs Affected: Android
diff --git a/docs/ComputeLibrary.dir b/docs/ComputeLibrary.dir
index de4968c0ab..74ac9d9d23 100644
--- a/docs/ComputeLibrary.dir
+++ b/docs/ComputeLibrary.dir
@@ -44,15 +44,15 @@
*/
/** @dir src/core/NEON
- * @brief Neon backend core: kernels and utilities.
+ * @brief Arm® Neon™ backend core: kernels and utilities.
*/
/** @file src/core/NEON/NEKernels.h
- * @brief Includes all the Neon kernels at once
+ * @brief Includes all the Arm® Neon™ kernels at once
*/
/** @dir src/core/NEON/kernels
- * @brief Folder containing all the Neon kernels
+ * @brief Folder containing all the Arm® Neon™ kernels
*/
/** @dir arm_compute/core/utils
@@ -76,7 +76,7 @@
*/
/** @dir arm_compute/graph/backends/NEON
- * @brief Neon specific operations
+ * @brief Arm® Neon™ specific operations
*/
/** @dir arm_compute/graph/detail
@@ -148,15 +148,15 @@
*/
/** @dir arm_compute/runtime/NEON
- * @brief Neon backend runtime interface.
+ * @brief Arm® Neon™ backend runtime interface.
*/
/** @file arm_compute/runtime/NEON/NEFunctions.h
- * @brief Includes all the Neon functions at once.
+ * @brief Includes all the Arm® Neon™ functions at once.
*/
/** @dir arm_compute/runtime/NEON/functions
- * @brief Folder containing all the Neon functions.
+ * @brief Folder containing all the Arm® Neon™ functions.
*/
/** @dir arm_compute/runtime/OMP
@@ -182,8 +182,8 @@
*
* -# cl_*.cpp --> OpenCL examples
* -# graph_*.cpp --> Graph examples
- * -# neoncl_*.cpp --> Neon / OpenCL interoperability examples
- * -# neon_*.cpp --> Neon examples
+ * -# neoncl_*.cpp --> Arm® Neon™ / OpenCL interoperability examples
+ * -# neon_*.cpp --> Arm® Neon™ examples
*/
/** @dir examples/gemm_tuner
@@ -211,11 +211,11 @@
*/
/** @dir src/core/NEON/wrapper
- * @brief Neon wrapper used to simplify code
+ * @brief Arm® Neon™ wrapper used to simplify code
*/
/** @file src/core/NEON/wrapper/traits.h
- * @brief Traits defined on Neon vectors
+ * @brief Traits defined on Arm® Neon™ vectors
*/
/** @file src/core/NEON/wrapper/wrapper.h
@@ -223,7 +223,7 @@
*/
/** @dir src/core/NEON/wrapper/intrinsics
- * @brief Neon intrinsics wrappers
+ * @brief Arm® Neon™ intrinsics wrappers
*/
/** @dir src/core/NEON/wrapper/scalar
@@ -255,7 +255,7 @@
*/
/** @dir tests/NEON
- * @brief Neon accessors.
+ * @brief Arm® Neon™ accessors.
*/
/** @dir tests/benchmark
@@ -267,7 +267,7 @@
*/
/** @dir tests/benchmark/NEON
- * @brief Neon benchmarking tests.
+ * @brief Arm® Neon™ benchmarking tests.
*/
/** @dir tests/benchmark_examples
@@ -299,7 +299,7 @@
*/
/** @dir tests/validation/NEON
- * @brief Neon validation tests.
+ * @brief Arm® Neon™ validation tests.
*/
/** @dir tests/validation/reference
diff --git a/examples/neon_cnn.cpp b/examples/neon_cnn.cpp
index 11bb0884bd..5ecf055e60 100644
--- a/examples/neon_cnn.cpp
+++ b/examples/neon_cnn.cpp
@@ -257,7 +257,7 @@ private:
Tensor out_fc0{};
Tensor out_softmax{};
- // Neon allocator
+ // Allocator
Allocator allocator{};
// Memory groups
diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp
index 824e419cf7..3043709fe7 100644
--- a/examples/neon_copy_objects.cpp
+++ b/examples/neon_copy_objects.cpp
@@ -140,7 +140,7 @@ public:
}
void do_run() override
{
- // Run Neon softmax:
+ // Run softmax:
softmax.run();
}
void do_teardown() override
diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py
index 721855ee27..82b40f0e36 100644
--- a/scripts/include_functions_kernels.py
+++ b/scripts/include_functions_kernels.py
@@ -1,4 +1,27 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2017-2018, 2020-2021 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
import glob
import collections
import os
@@ -9,7 +32,7 @@ src_path ="src"
Target = collections.namedtuple('Target', 'name prefix basepath')
core_targets = [
- Target("NEON", "NE", src_path), # Neon kernels are under src
+ Target("NEON", "NE", src_path), # Arm® Neon™ kernels are under src
Target("CL", "CL", src_path), # CL kernels are under src
Target("CPP", "CPP", armcv_path) # CPP kernels are under arm_compute
]
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index eea4458170..27878cde36 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -192,7 +192,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
/** Each bit of the result is set to the corresponding bit of either then_val or
* else_val depending on whether the corresponding bit of if_mask is set.
- * Equivalent to the VBSL instruction in Arm Neon.
+ * Equivalent to the VBSL instruction in Arm® Neon™.
*
* @param[in] size Size of vector.
*
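For reference, the bitwise select described above reduces to a mask blend; a scalar model of it as a sketch (an illustration, not the library macro itself):

@code{.cpp}
// Each result bit comes from then_val where if_mask is set, else from else_val.
inline int32_t select_using_mask(int32_t if_mask, int32_t then_val, int32_t else_val)
{
    return (if_mask & then_val) | (~if_mask & else_val);
}
@endcode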
@@ -320,7 +320,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
}
/** Calculates (a+b)/2, rounded to the nearest integer.
- * Equivalent to VRHADD in the Arm Neon instruction set.
+ * Equivalent to VRHADD in the Arm® Neon™ instruction set.
*
* @param[in] size Size of vector.
*
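A scalar model of this rounding halving add, mirroring gemmlowp's RoundingHalfSum (a sketch for illustration, not the library macro itself):

@code{.cpp}
// (a + b) / 2 rounded to nearest, with ties rounded away from zero.
// The 64-bit intermediate avoids overflow of the sum.
inline int32_t rounding_half_sum(int32_t a, int32_t b)
{
    const int64_t sum  = static_cast<int64_t>(a) + static_cast<int64_t>(b);
    const int64_t sign = (sum >= 0) ? 1 : -1;
    return static_cast<int32_t>((sum + sign) / 2);
}
@endcode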
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index c1346b1fcc..14264cb883 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -146,7 +146,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
if(!found_mali)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to default.");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default.");
return GPUTarget::MIDGARD;
}
@@ -179,7 +179,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
// Report in case of unknown target
if(gpu_target == GPUTarget::UNKNOWN)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one. (BIFROST)");
+        ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ GPU unknown. Target is set to the default one. (BIFROST)");
return GPUTarget::BIFROST;
}
diff --git a/src/core/NEON/INESimpleKernel.h b/src/core/NEON/INESimpleKernel.h
index d2b6de427b..2986e7b5c9 100644
--- a/src/core/NEON/INESimpleKernel.h
+++ b/src/core/NEON/INESimpleKernel.h
@@ -28,7 +28,7 @@
namespace arm_compute
{
-/** Interface for simple Neon kernels having 1 tensor input and 1 tensor output */
+/** Interface for simple CPU kernels having 1 tensor input and 1 tensor output */
using INESimpleKernel = ICPPSimpleKernel;
} // namespace arm_compute
#endif /*ARM_COMPUTE_INESIMPLEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 9d47d7d76f..1691943b07 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -148,7 +148,7 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
template <typename T, bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
const int window_step_x = 16 / sizeof(T);
@@ -164,7 +164,7 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
F activation_functor(_act_info);
// Hold information about the current feature map we are iterating.
- // Only compute denominator and Neon vectors once per feature map.
+ // Only compute denominator and constants once per feature map.
int slice = -1;
const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index c9066578b2..008ad7c9f4 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -40,7 +40,7 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 26e4455c4a..4ba02f1542 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -44,7 +44,7 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
{
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate configured output
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 00a519d229..397bf5ab17 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -32,7 +32,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform col2im reshaping.
+/** Kernel to perform col2im reshaping.
*
* Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
*
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index 101d1384d0..4cd1bc79fe 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -69,7 +69,7 @@ Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input,
DataLayout data_layout)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
index 0be1fbe5aa..67d5ca246e 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
@@ -32,7 +32,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Neon kernel to convert asymmetric signed to asymmetric signed and vice-versa */
+/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
class NEConvertQuantizedSignednessKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index e81d50fe5f..09f99748bf 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -149,7 +149,7 @@ inline bool run_optim_small_tensor(const ITensor *t)
// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
-// store intermidiate results in memory. Temporary results are stored in Neon registers directly and then written to the output buffer.
+// store intermediate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer.
template <unsigned int stridex>
class convolver_w1x1_i8x8_f32
{
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 58d385a138..258def77a3 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon interface for Direct Convolution Layer kernel */
+/** Interface for the kernel to perform Direct Convolution Layer. */
class NEDirectConvolutionLayerKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 8dcbd00ddc..3597045bd5 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -93,7 +93,7 @@ typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value,
output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
index cd0710d0c1..8f7eeb05b2 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -30,7 +30,7 @@
namespace arm_compute
{
class ITensor;
-/** Neon kernel to accumulate the biases, if provided, or downscale in case of quantized input.
+/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
*
* @note We assume bias to be shared
* @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 70178dffc0..1c7c1f9763 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -103,7 +103,7 @@ void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, Bord
void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_ERROR_ON(tensor->data_type() == DataType::UNKNOWN);
_border_size = border_size;
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 96ed810e0e..9011680c9b 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -47,7 +47,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 92fbd12a54..e592d5ef6e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to interleave the elements of a matrix
+/** Kernel to interleave the elements of a matrix
*
* This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
*
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index dfdb7b3236..acfb79edeb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to multiply matrices
+/** Kernel to multiply matrices
*
* @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
* This kernel performs the following computation:
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 9911ffc0f4..f71929fe9e 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** Kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
* and adds to it the offset contribution of matrix A and matrix B in-place.
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
index 39fbd8eb0e..6908f37aad 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
+/** Kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
*
* The computation is performed in-place
*
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
index 63d80aaf1b..021ff8e2e0 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 8e92ba6eca..b01b204a6f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
* The following computations will be performed by the kernel:
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 9b51a3ba84..9e7dc2f599 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index 4d43afaab2..def0573967 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 521adbfca4..9be618d656 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -32,7 +32,7 @@ namespace arm_compute
class ITensor;
struct GEMMLowpReductionKernelInfo;
-/** Common interface for all Neon reduction kernels */
+/** Common interface for all reduction kernels */
class INEGEMMLowpReductionKernel : public INEKernel
{
public:
@@ -69,7 +69,7 @@ protected:
bool _mul_by_scalar;
};
-/** Neon kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
*
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
@@ -130,7 +130,7 @@ private:
void run_internal(const Window &window);
};
-/** Neon kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
*
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
index f9ff143e07..c896cabc6a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
* @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
*
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
index e2945ee117..3bc162a1b4 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+/** Kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
*
@note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
* @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index ad7ae505f4..f6a453cbbc 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -51,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
if(output->total_size() != 0)
{
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 583588a1c1..7ca71cf414 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -31,7 +31,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Neon kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
*
 * The following is an example of how the 1xW transposition works when the input data is F32
*
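
The chunk width W is simply the number of elements that fit in one 16-byte (128-bit) Arm® Neon™ register, so the worked arithmetic is (illustration only, not part of the patch):

    #include <cstddef>

    // W = 16 / element size:
    constexpr std::size_t chunk_width(std::size_t element_size) { return 16 / element_size; }
    static_assert(chunk_width(4) == 4,  "F32 -> 1x4 chunks");
    static_assert(chunk_width(2) == 8,  "F16 -> 1x8 chunks");
    static_assert(chunk_width(1) == 16, "U8  -> 1x16 chunks");
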
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index 46b41b28e3..0711f8190b 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -33,7 +33,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Kernel to perform other operation on Neon */
+/** Kernel to perform gather operation. */
class NEGatherKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index fa1f7a6c49..d33431a8d2 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -82,7 +82,7 @@ inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_
template <typename T, typename AccType = T>
void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
// Clear X/Y dimensions on execution window as we handle the planes manually
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 21a70ae513..f1c5d3f6e6 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -170,7 +170,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize_float(const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
Window win(window);
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index af0dbfdc64..00cda7dc22 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -30,10 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to add padding to a tensor
- *
- * Add padding given padding information
- */
+/** Basic kernel to pad the input tensor given padding information. */
class NEPadLayerKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index 5522ae889a..a3ff6e988f 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -31,7 +31,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform layer normalization */
+/** Kernel to perform layer normalization for QLSTM. */
class NEQLSTMLayerNormalizationKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 170f58fd7a..d0c51f8497 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -43,7 +43,7 @@ namespace
template <typename T>
void range_function(ITensor *output, float start, float step, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
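
For reference, the semantics that range_function vectorizes (with step_vec broadcast via vdup_n) form a plain arithmetic progression; a scalar sketch, under the obvious assumption about the output layout:

    #include <cstddef>

    // Scalar reference: element i of the output is start + i * step.
    template <typename T>
    void range_reference(T *out, std::size_t n, float start, float step)
    {
        for(std::size_t i = 0; i < n; ++i)
        {
            out[i] = static_cast<T>(start + i * step);
        }
    }
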
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 875d23333e..553048c7dd 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -386,7 +386,7 @@ public:
template <typename T, int S>
struct RedOpX
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
@@ -908,7 +908,7 @@ struct RedOpX_quantized
template <typename T, int S>
struct RedOpYZW
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
@@ -1127,7 +1127,7 @@ struct RedOpYZW
template <typename T, int S, int axis, ReductionOperation op>
struct RedOpYZW_complex
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index c7ed0070be..667305b3aa 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform a reduction operation
+/** Kernel to perform a reduction operation
*
* @note For ARG_MIN/ARG_MAX reduction, the default data type for an uninitialized
* output tensor is signed 32-bit integer (S32). It is the user's responsibility
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
index adc7f4bdd5..21cb67ef58 100644
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ b/src/core/NEON/kernels/NERemapKernel.h
@@ -31,7 +31,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform a remap on a tensor */
+/** Kernel to perform a remap on a tensor */
class NERemapKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 75a58fccd6..a7b830c066 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -42,7 +42,7 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
{
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index b2fce0f56d..758433f89f 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -37,7 +37,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 64310e7f7f..07ce829c43 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -44,7 +44,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index dc5c7d77ad..9b36518e4d 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -32,7 +32,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to stacks a rank-R tensor into one with rank-(R+1) along the axis dimension.*/
+/** Basic kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
class NEStackLayerKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NETileKernel.h b/src/core/NEON/kernels/NETileKernel.h
index e6ce9534e7..47f306afd0 100644
--- a/src/core/NEON/kernels/NETileKernel.h
+++ b/src/core/NEON/kernels/NETileKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform a tile operation */
+/** Basic kernel to perform a tile operation */
class NETileKernel : public INEKernel
{
public:
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 44d60093f0..9bef9c30d9 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -47,7 +47,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
if(biases != nullptr)
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
index a4a9e28763..76eca9fe86 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** Neon kernel to perform reshaping on the weights used by convolution and locally connected layer
+/** Kernel to perform reshaping on the weights used by convolution and locally connected layer
*
* Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
* In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
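
The reshape-plus-im2col pairing described above is what turns a convolution into a single matrix multiplication: each reshaped weight row meets one im2col column per output pixel. A sketch with assumed row-major buffers (the real kernels additionally handle bias, data layouts and padding):

    // c_out rows of linearized kernels times an im2col matrix of k x pixels,
    // where k = KH * KW * C_in and pixels = OH * OW.
    void conv_as_gemm(const float *w_rows, const float *im2col, float *out,
                      int c_out, int k, int pixels)
    {
        for(int oc = 0; oc < c_out; ++oc)
        {
            for(int p = 0; p < pixels; ++p)
            {
                float acc = 0.f;
                for(int i = 0; i < k; ++i)
                {
                    acc += w_rows[oc * k + i] * im2col[i * pixels + p];
                }
                out[oc * pixels + p] = acc;
            }
        }
    }
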
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 3583735482..75d257de4b 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -35,7 +35,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Interface for the Neon kernel to perform Winograd input transform. */
+/** Interface for the kernel to perform Winograd input transform. */
class INEWinogradLayerTransformInputKernel : public INEKernel
{
public:
@@ -96,7 +96,7 @@ public:
}
};
-/** Neon kernel to perform Winograd input transform. */
+/** Kernel to perform Winograd input transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel
{
@@ -227,7 +227,7 @@ private:
ITensor *_workspace;
};
-/** Interface for the Neon kernel to perform Winograd output transform. */
+/** Interface for the kernel to perform Winograd output transform. */
class INEWinogradLayerTransformOutputKernel : public INEKernel
{
public:
@@ -310,7 +310,7 @@ public:
}
};
-/** Neon kernel to perform Winograd output transform. */
+/** Kernel to perform Winograd output transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel
{
@@ -437,7 +437,7 @@ private:
int _num_channels;
};
-/** Interface for the Neon kernel to perform Winograd weights transform. */
+/** Interface for the kernel to perform Winograd weights transform. */
class INEWinogradLayerTransformWeightsKernel : public INEKernel
{
public:
@@ -495,7 +495,7 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
};
-/** Neon kernel to perform Winograd weights transform. */
+/** Kernel to perform Winograd weights transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel
{
@@ -577,7 +577,7 @@ private:
int _num_input_channels;
};
-/** Neon kernel to perform Winograd. */
+/** Kernel to perform Winograd. */
template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerConfiguration
{
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 5c894c01c8..6c3743dce7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -111,7 +111,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
},
#endif // __ARM_FEATURE_SVE
-// Neon hybrid methods
+// Arm® Neon™ hybrid methods
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_fp32_mla_8x4",
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index bb86d9e41d..0d56b46e19 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -292,21 +292,21 @@ void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y
/* AArch32 */
#ifdef __arm__
/* FP32 */
-/* Neon implementation (height 6) */
+/* Arm® Neon™ implementation (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
/* FP16 */
#if __ARM_FP16_ARGS
-/* Neon implementation using FP32 kernel (height 6) */
+/* Arm® Neon™ implementation using FP32 kernel (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif /* __ARM_FP16_ARGS */
/* BF16 */
-/* Neon implementation using FP32 kernel */
+/* Arm® Neon™ implementation using FP32 kernel */
template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -315,7 +315,7 @@ template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t,
/* AArch64 */
#ifdef __aarch64__
/* FP32 */
-/* Neon/SVE implementation (height 8) */
+/* Arm® Neon™/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -339,7 +339,7 @@ template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *,
template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
/* BF16 */
-/* Neon/SVE BFDOT */
+/* Arm® Neon™/SVE BFDOT */
#ifdef V8P6_BF
template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -350,7 +350,7 @@ template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // V8P6_BF
-/* Neon/SVE using FP32 kernel */
+/* Arm® Neon™/SVE using FP32 kernel */
template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -365,12 +365,12 @@ template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
/* INT8 */
-/* Neon SMLA/SMLAL (height 4, block 16) */
+/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-/* Neon SDOT (height 8, block 4) */
+/* Arm® Neon™ SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -382,17 +382,17 @@ template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // MMLA_INT8
-/* Neon SDOT (height 8, block 1) */
+/* Arm® Neon™ SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-/* Neon SMLA/SMLAL (height 4, block 16) */
+/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-/* Neon SDOT (height 8, block 4) */
+/* Arm® Neon™ SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -404,7 +404,7 @@ template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // MMLA_INT8
-/* Neon 16-bit (height 8, block 1) */
+/* Arm® Neon™ 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index c105adac70..ed5254a0a4 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -44,7 +44,7 @@ template <typename T>
void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
float epsilon, ActivationLayerInfo &act_info, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
const int window_step_x = 8;
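
Per element, the vectorized body computes the standard batch-normalization formula; a scalar sketch that ignores the fused activation act_info can request:

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    inline float batch_norm_scalar(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }
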
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index 4a90a211c7..d6e22e1843 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -43,7 +43,7 @@ template <typename T>
void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
float epsilon, ActivationLayerInfo &act_info, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
const int window_step_x = 4;
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index e68f1117e8..25d682d8ae 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -34,7 +34,7 @@ namespace detail
template <typename T, int S>
struct dummy
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
/** Construct a dummy activation object.
@@ -68,9 +68,9 @@ struct dummy
template <typename T, int S>
struct linear
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a Linear activation object.
@@ -112,9 +112,9 @@ struct linear
template <typename T, int S>
struct square
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a Square activation object.
@@ -148,9 +148,9 @@ struct square
template <typename T, int S>
struct logistic
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a Logistic activation object.
@@ -188,9 +188,9 @@ struct logistic
template <typename T, int S>
struct relu
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a RELU activation object.
@@ -228,9 +228,9 @@ struct relu
template <typename T, int S>
struct brelu
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a bounded RELU activation object.
@@ -270,9 +270,9 @@ struct brelu
template <typename T, int S>
struct lubrelu
{
- /** Neon vector type. */
+ /** SIMD vector type. */
using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
/** Construct a lower-upper bounded RELU activation object.
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
index b786e44bc7..3452b76761 100644
--- a/src/core/NEON/wrapper/traits.h
+++ b/src/core/NEON/wrapper/traits.h
@@ -44,7 +44,7 @@ struct vector_64_tag {};
/** 128-bit vector tag */
struct vector_128_tag {};
-/** Create the appropriate Neon vector given its type and size in terms of elements */
+/** Create the appropriate SIMD vector given its type and size in terms of elements */
template <typename T, int S> struct neon_vector;
// Specializations
@@ -88,7 +88,7 @@ enum class BitWidth
W128, /**< 128-bit width */
};
-/** Create the appropriate Neon vector given its type and size in terms of bits */
+/** Create the appropriate SIMD vector given its type and size in terms of bits */
template <typename T, BitWidth BW> struct neon_bitvector;
// Specializations
#ifndef DOXYGEN_SKIP_THIS
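
Combined with the usages visible elsewhere in this patch, the intended pattern looks like the following sketch (assuming the wrapper's vdup_n and vstore overloads; an illustration, not a new API):

    // The tag type selects the right intrinsic overload at compile time,
    // e.g. float + W128 resolves to float32x4_t and vdupq_n_f32.
    template <typename T>
    void fill_16_bytes(T value, T *dst)
    {
        using TagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
        const auto v  = wrapper::vdup_n(value, TagType{});
        wrapper::vstore(dst, v);
    }
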
diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
index bd6d777572..aaa1898ce9 100644
--- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
@@ -133,7 +133,7 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c
Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
index d8eed44cd8..35c189caeb 100644
--- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
@@ -134,7 +134,7 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c
Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
index 4dc458a4a8..363c271a68 100644
--- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
@@ -49,7 +49,7 @@ namespace
Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
index efefd5d011..11b1db5bc2 100644
--- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
@@ -49,7 +49,7 @@ namespace
Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
diff --git a/src/core/cpu/kernels/CpuReshapeKernel.cpp b/src/core/cpu/kernels/CpuReshapeKernel.cpp
index 4ab1612518..70c652695a 100644
--- a/src/core/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/core/cpu/kernels/CpuReshapeKernel.cpp
@@ -50,7 +50,7 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
diff --git a/src/core/cpu/kernels/CpuTransposeKernel.cpp b/src/core/cpu/kernels/CpuTransposeKernel.cpp
index ed08aa1aa0..c7cafe94a8 100644
--- a/src/core/cpu/kernels/CpuTransposeKernel.cpp
+++ b/src/core/cpu/kernels/CpuTransposeKernel.cpp
@@ -95,7 +95,7 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Iterator output(out, window_out);
- // Run the Neon path if and only if the input is not a row-vector
+ // Run the SIMD path if and only if the input is not a row-vector
if(in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
@@ -234,7 +234,7 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
- // Run the Neon path if and only if the input is not a row-vector
+ // Run the SIMD path if and only if the input is not a row-vector
if(in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
@@ -347,7 +347,7 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
- // Run the Neon path if and only if the input is not a row-vector
+ // Run the SIMD path if and only if the input is not a row-vector
if(in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
@@ -455,7 +455,7 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
// Error if input is not 8 bit, 16bit or 32bit
diff --git a/src/core/cpu/kernels/activation/NEON/fp16.cpp b/src/core/cpu/kernels/activation/NEON/fp16.cpp
index 0ddd43ea0e..6f2d5d8533 100644
--- a/src/core/cpu/kernels/activation/NEON/fp16.cpp
+++ b/src/core/cpu/kernels/activation/NEON/fp16.cpp
@@ -50,7 +50,7 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma
void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
const ActivationLayerInfo::ActivationFunction act = act_info.activation();
diff --git a/src/core/cpu/kernels/activation/NEON/fp32.cpp b/src/core/cpu/kernels/activation/NEON/fp32.cpp
index 244ca5739f..54301d45ad 100644
--- a/src/core/cpu/kernels/activation/NEON/fp32.cpp
+++ b/src/core/cpu/kernels/activation/NEON/fp32.cpp
@@ -48,7 +48,7 @@ inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &ma
void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
constexpr int window_step_x = 4;
diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h
index 964bdccca3..3ab03dd40e 100644
--- a/src/core/cpu/kernels/add/neon/list.h
+++ b/src/core/cpu/kernels/add/neon/list.h
@@ -47,7 +47,7 @@ DECLARE_ADD_KERNEL(add_u8_u8_s16_neon);
template <typename ScalarType>
void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
// Create input windows
diff --git a/src/core/cpu/kernels/pooling/neon/quantized.h b/src/core/cpu/kernels/pooling/neon/quantized.h
index 535fb53d87..a16960a205 100644
--- a/src/core/cpu/kernels/pooling/neon/quantized.h
+++ b/src/core/cpu/kernels/pooling/neon/quantized.h
@@ -473,7 +473,7 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
Iterator in(src, window_src);
Iterator out(dst0, window);
- /** Neon vector types */
+ /** SIMD vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -602,7 +602,7 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
Iterator in(src, window_src);
Iterator out(dst0, window);
- /** Neon vector types */
+ /** SIMD vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -756,7 +756,7 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *
Iterator in(src, window_src);
Iterator out(dst0, window);
- /** Neon vector types */
+ /** SIMD vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
using q16_t = typename wrapper::traits::promote_t<T>;
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/NEON/list.h
index 740e6ea9bc..5ebee31272 100644
--- a/src/core/cpu/kernels/softmax/impl/NEON/list.h
+++ b/src/core/cpu/kernels/softmax/impl/NEON/list.h
@@ -36,7 +36,7 @@ namespace cpu
template <typename T>
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
constexpr int window_step_x = 16 / sizeof(T);
@@ -267,7 +267,7 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c
Iterator max_it(max, window);
Iterator out_it(out, window);
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
constexpr int vec_size = 16 / sizeof(T);
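
Together these implement the numerically stable softmax: subtracting the running max keeps every exponent argument non-positive, so exp cannot overflow. A scalar sketch, ignoring the beta scaling and the quantized paths:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Assumes n > 0.
    void softmax_1d_reference(const float *in, float *out, std::size_t n)
    {
        float max_val = in[0]; // neon_logits_1d_max stage
        for(std::size_t i = 1; i < n; ++i)
        {
            max_val = std::max(max_val, in[i]);
        }
        float sum = 0.f;
        for(std::size_t i = 0; i < n; ++i)
        {
            out[i] = std::exp(in[i] - max_val);
            sum += out[i];
        }
        for(std::size_t i = 0; i < n; ++i)
        {
            out[i] /= sum; // normalize
        }
    }
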
diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h
index 8c82402513..1ab4e6367b 100644
--- a/src/core/cpu/kernels/sub/neon/list.h
+++ b/src/core/cpu/kernels/sub/neon/list.h
@@ -47,7 +47,7 @@ DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon);
template <typename T>
void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
- /** Neon vector tag type. */
+ /** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
bool is_sat = policy == ConvertPolicy::SATURATE;
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index a6c4fe9aa3..9efa3ac0c8 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -52,7 +52,7 @@ namespace graph
{
namespace backends
{
-/** Register Neon backend */
+/** Register CPU backend */
static detail::BackendRegistrar<NEDeviceBackend> NEDeviceBackend_registrar(Target::NEON);
NEDeviceBackend::NEDeviceBackend()
@@ -138,7 +138,7 @@ std::unique_ptr<ITensorHandle> NEDeviceBackend::create_subtensor(ITensorHandle *
std::unique_ptr<arm_compute::IFunction> NEDeviceBackend::configure_node(INode &node, GraphContext &ctx)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring Neon node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring CPU node with ID : " << node.id() << std::endl);
ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
// Configure node
@@ -147,7 +147,7 @@ std::unique_ptr<arm_compute::IFunction> NEDeviceBackend::configure_node(INode &n
arm_compute::Status NEDeviceBackend::validate_node(INode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Neon node with ID : " << node.id() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating CPU node with ID : " << node.id() << std::endl);
ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
return NENodeValidator::validate(&node);
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 6cf5874633..0fc5291648 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -54,7 +54,7 @@ struct NETargetInfo
Target NETargetInfo::TargetType = Target::NEON;
-/** Collection of Neon convolution functions */
+/** Collection of CPU convolution functions */
struct NEConvolutionLayerFunctions
{
using GenericConvolutionLayer = NEConvolutionLayer;
@@ -63,7 +63,7 @@ struct NEConvolutionLayerFunctions
using WinogradConvolutionLayer = NEWinogradConvolutionLayer;
};
-/** Collection of Neon element-wise functions */
+/** Collection of CPU element-wise functions */
struct NEEltwiseFunctions
{
using Addition = NEArithmeticAddition;
@@ -73,13 +73,13 @@ struct NEEltwiseFunctions
using Division = NEElementwiseDivision;
};
-/** Collection of Neon unary element-wise functions */
+/** Collection of CPU unary element-wise functions */
struct NEUnaryEltwiseFunctions
{
using Exp = NEExpLayer;
};
-/** Function and tensor types to be used inside a Neon fused convolution/batch normalization layer */
+/** Function and tensor types to be used inside a fused convolution/batch normalization layer */
struct NEFusedLayerTypes
{
using ConvolutionLayer = NEConvolutionLayer;
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 8453cfa03a..d6e372004b 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -50,7 +50,7 @@ namespace graph
{
namespace backends
{
-/** Collection of Neon element-wise functions */
+/** Collection of CPU element-wise functions */
struct NEEltwiseLayerFunctions
{
using ArithmeticAddition = NEArithmeticAddition;
@@ -60,7 +60,7 @@ struct NEEltwiseLayerFunctions
using ArithmeticDivision = NEElementwiseDivision;
};
-/** Collection of Neon unary element-wise functions */
+/** Collection of CPU unary element-wise functions */
struct NEUnaryEltwiseLayerFunctions
{
using ExpLayer = NEExpLayer;
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index e72a6c3226..7fb1d583ff 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -285,7 +285,7 @@ private:
/** Assembly Gemm kernel */
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
- /** Optimised Neon kernel */
+ /** Optimised Arm® Neon™ kernel */
std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
/** Input A */
const ITensor *_a{ nullptr };
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 59747a82f9..900330e4a6 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -431,7 +431,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
const DataLayout data_layout = input->data_layout();
const DataType data_type = input->data_type();
@@ -523,7 +523,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
if(!skip_im2col)
{
// Create tensor info for im2col reshaped inputs
- // For Neon the batch size is on the fourth dimension
+ // For CPU, the batch size is on the fourth dimension
// TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482
TensorShape shape_im2col = input->tensor_shape();
shape_im2col.set(0, mat_weights_rows);
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h
index 9ebcd5f6aa..b1647ea689 100644
--- a/src/runtime/cpu/operators/CpuPooling.h
+++ b/src/runtime/cpu/operators/CpuPooling.h
@@ -40,7 +40,7 @@ namespace cpu
{
// Forward Declarations
class CpuPoolingAssemblyDispatch;
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following Neon kernels:
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
*
* -# @ref NEFillBorderKernel (executed if padding size is different from zero)
* -# @ref kernels::CpuPoolingKernel
diff --git a/tests/SConscript b/tests/SConscript
index d4cc514693..df7a6c5686 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
# Copyright (c) 2017-2021 Arm Limited.
#
# SPDX-License-Identifier: MIT
diff --git a/tests/datasets/ScaleValidationDataset.h b/tests/datasets/ScaleValidationDataset.h
index 25112f155f..881be0fc26 100644
--- a/tests/datasets/ScaleValidationDataset.h
+++ b/tests/datasets/ScaleValidationDataset.h
@@ -140,7 +140,7 @@ const auto ScaleAlignCornersSamplingPolicySet = combine(framework::dataset::make
}),
framework::dataset::make("AlignCorners", { true }));
-/** Generated shapes: Used by Neon precommit and nightly
+/** Generated shapes: used by precommit and nightly for CPU tests
* - 2D shapes with 0, 1, 2 vector iterations
* - 3D shapes with 0, 1 vector iterations
* - 4D shapes with 0 vector iterations
diff --git a/tests/framework/SConscript b/tests/framework/SConscript
index e805ac0e2c..c4fe50db05 100644
--- a/tests/framework/SConscript
+++ b/tests/framework/SConscript
@@ -1,4 +1,6 @@
-# Copyright (c) 2017 Arm Limited.
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2017-2021 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -28,7 +30,7 @@ Import('vars')
# vars is imported from arm_compute:
variables = [
BoolVariable("pmu", "Enable PMU counters", False),
- BoolVariable("mali", "Enable Mali hardware counters", False),
+ BoolVariable("mali", "Enable Arm® Mali™ hardware counters", False),
]
# We need a separate set of Variables for the Help message (Otherwise the global variables will get displayed twice)
@@ -67,7 +69,7 @@ if not env['opencl']:
files = [f for f in files if "OpenCL" not in os.path.basename(str(f))]
if not framework_env['mali']:
- # Remove Mali files
+ # Remove Arm® Mali™ files
files = [f for f in files if "MaliCounter" not in os.path.basename(str(f))]
else:
framework_env.Append(CPPDEFINES = ['MALI_ENABLED'])
diff --git a/tests/framework/instruments/hwc_names.hpp b/tests/framework/instruments/hwc_names.hpp
index e68bcbed82..c39f3bba7a 100644
--- a/tests/framework/instruments/hwc_names.hpp
+++ b/tests/framework/instruments/hwc_names.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@ enum
/*
* "Short names" for hardware counters used by Streamline. Counters names are
* stored in accordance with their memory layout in the binary counter block
- * emitted by the Mali GPU. Each "master" in the GPU emits a fixed-size block
+ * emitted by the Arm® Mali™ GPU. Each "master" in the GPU emits a fixed-size block
* of 64 counters, and each GPU implements the same set of "masters" although
* the counters each master exposes within its block of 64 may vary.
*
diff --git a/tests/validation/NEON/DetectionPostProcessLayer.cpp b/tests/validation/NEON/DetectionPostProcessLayer.cpp
index a166402a79..7d725327b7 100644
--- a/tests/validation/NEON/DetectionPostProcessLayer.cpp
+++ b/tests/validation/NEON/DetectionPostProcessLayer.cpp
@@ -150,7 +150,7 @@ inline void base_test_case(DetectionPostProcessLayerInfo info, DataType data_typ
quantize_and_fill_tensor(Accessor(anchors), anchors_vector);
}
- // Determine the output through the Neon kernel
+ // Determine the output through the Compute Library operator
Tensor output_boxes;
Tensor output_classes;
Tensor output_scores;
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 617f64ce1d..9738213114 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -167,7 +167,7 @@ TEST_SUITE(Quantized)
TEST_SUITE(QSYMM16)
/** Tests will be targeting
- * - Comparison between Neon kernel and the exact same but scalar version of reference kernel
+ * - Comparison between optimized kernel and the exact same but scalar version of reference kernel
* - Input shapes of 1D and 2D with the first dimension covers boundary values of 128-bit vector size (0~3 iterations)
* - Weight and bias 1D shape that have same size as that of input shapes
* - Quantization scale is greater and smaller than one.
@@ -179,7 +179,7 @@ TEST_SUITE(QSYMM16)
 * - The algorithm is sensitive to quantization scale, but it is hard to fully test
 * the sensitivity due to the aforementioned reason.
* - Again, it is hard to fully test corner values due to the exact same algorithm of the
- * reference kernel and the Neon kernel.
+ * reference kernel and the optimized kernel.
*/
constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t);
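
The arithmetic behind qsymm16_per_vector (illustration only): a 128-bit register holds vector_size_byte = 16 bytes, so each vector iteration consumes eight QSYMM16 values, and the 0~3-iteration shapes mentioned above cover first dimensions from the leftover-only case up to 24 full-vector elements plus leftovers.

    #include <cstdint>

    // vector_size_byte = 16 (128 bits) and sizeof(int16_t) = 2:
    static_assert(16 / sizeof(int16_t) == 8, "eight QSYMM16 lanes per vector iteration");
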
diff --git a/tests/validation/fixtures/DepthConvertLayerFixture.h b/tests/validation/fixtures/DepthConvertLayerFixture.h
index 937a1a06a9..7ec35b4ea8 100644
--- a/tests/validation/fixtures/DepthConvertLayerFixture.h
+++ b/tests/validation/fixtures/DepthConvertLayerFixture.h
@@ -67,7 +67,7 @@ protected:
}
else
{
- // When converting S32 to F16, both reference and Neon implementations are + or - infinity outside the F16 range.
+ // When converting S32 to F16, both reference and Compute Library implementations are + or - infinity outside the F16 range.
if(dt_in == DataType::S32 && dt_out == DataType::F16)
{
std::uniform_int_distribution<int32_t> distribution_s32(-65504, 65504);
diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h
index 25873acc93..1f593bb696 100644
--- a/tests/validation/reference/UtilsQuantizedAsymm.h
+++ b/tests/validation/reference/UtilsQuantizedAsymm.h
@@ -40,7 +40,7 @@ inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
}
-/** Multiplication of two integers. The same as ARMv7 Neon VQRDMULH instruction. */
+/** Multiplication of two integers. The same as ARMv7 Arm® Neon™ VQRDMULH instruction. */
inline int32_t asymm_int_mult(int32_t a, int32_t b)
{
bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
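
Only the overflow guard is visible in this hunk; for readers unfamiliar with VQRDMULH, the well-known gemmlowp-style scalar equivalent (a rounding, doubling, high-half multiply that saturates the single INT32_MIN * INT32_MIN overflow case) is sketched below; the function's actual body may differ in detail:

    #include <cstdint>
    #include <limits>

    inline int32_t vqrdmulh_scalar_sketch(int32_t a, int32_t b)
    {
        const bool    overflow = a == b && a == std::numeric_limits<int32_t>::min();
        const int64_t ab_64    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge    = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));               // round to nearest
        const int32_t high32   = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));    // doubling high half
        return overflow ? std::numeric_limits<int32_t>::max() : high32;
    }
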
diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h
index 8764514d7e..d2b05f27c6 100644
--- a/utils/GraphUtils.h
+++ b/utils/GraphUtils.h
@@ -673,7 +673,7 @@ inline TensorShape permute_shape(TensorShape tensor_shape, DataLayout in_data_la
/** Utility function to return the TargetHint
*
- * @param[in] target Integer value which expresses the selected target. Must be 0 for Neon or 1 for OpenCL or 2 (OpenCL with Tuner)
+ * @param[in] target Integer value which expresses the selected target. Must be 0 for Arm® Neon™, 1 for OpenCL, or 2 for OpenCL with the Tuner
*
* @return the TargetHint
*/
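
A sketch of the mapping this parameter implies; only Target::NEON appears elsewhere in this patch, so Target::CL and the tuner handling are assumptions here:

    arm_compute::graph::Target target_hint_from(int target)
    {
        switch(target)
        {
            case 1:
            case 2: // 2 additionally enables the OpenCL tuner
                return arm_compute::graph::Target::CL;
            case 0:
            default:
                return arm_compute::graph::Target::NEON;
        }
    }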