17 files changed, 1294 insertions, 627 deletions
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
deleted file mode 100644
index e66bb402fe..0000000000
--- a/docs/03_scripts.dox
+++ /dev/null
@@ -1,178 +0,0 @@
-///
-/// Copyright (c) 2017-2020 Arm Limited.
-///
-/// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to
-/// deal in the Software without restriction, including without limitation the
-/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-/// sell copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-namespace arm_compute
-{
-/**
-@page data_import Importing data from existing models
-
-@tableofcontents
-
-@section caffe_data_extractor Extract data from pre-trained caffe model
-
-One can find caffe <a href="https://github.com/BVLC/caffe/wiki/Model-Zoo">pre-trained models</a> on
-caffe's official github repository.
-
-The caffe_data_extractor.py provided in the scripts folder is an example script that shows how to
-extract the parameter values from a trained model.
-
-@note complex networks might require altering the script to properly work.
-
-@subsection caffe_how_to How to use the script
-
-Install caffe following <a href="http://caffe.berkeleyvision.org/installation.html">caffe's document</a>.
-Make sure the pycaffe has been added into the PYTHONPATH.
-
-Download the pre-trained caffe model.
-
-Run the caffe_data_extractor.py script by
-
-        python caffe_data_extractor.py -m <caffe model> -n <caffe netlist>
-
-For example, to extract the data from pre-trained caffe Alex model to binary file:
-
-        python caffe_data_extractor.py -m /path/to/bvlc_alexnet.caffemodel -n /path/to/caffe/models/bvlc_alexnet/deploy.prototxt
-
-The script has been tested under Python2.7.
-
-@subsection caffe_result  What is the expected output from the script
-
-If the script runs successfully, it prints the names and shapes of each layer onto the standard
-output and generates *.npy files containing the weights and biases of each layer.
-
-The arm_compute::utils::load_trained_data shows how one could load
-the weights and biases into tensor from the .npy file by the help of Accessor.
-
-@section tensorflow_data_extractor Extract data from pre-trained tensorflow model
-
-The script tensorflow_data_extractor.py extracts trainable parameters (e.g. values of weights and biases) from a
-trained tensorflow model. A tensorflow model consists of the following two files:
-
-{model_name}.data-{step}-{global_step}: A binary file containing values of each variable.
-
-{model_name}.meta:  A binary file containing a MetaGraph struct which defines the graph structure of the neural
-network.
-
-@note Since Tensorflow version 0.11 the binary checkpoint file which contains the values for each parameter has the format of:
-    {model_name}.data-{step}-of-{max_step}
-instead of:
-    {model_name}.ckpt
-When dealing with binary files with version >= 0.11, only pass {model_name} to -m option;
-when dealing with binary files with version < 0.11, pass the whole file name {model_name}.ckpt to -m option.
-
-@note This script relies on the parameters to be extracted being in the
-'trainable_variables' tensor collection. By default all variables are automatically added to this collection unless
-specified otherwise by the user. Thus should a user alter this default behavior and/or want to extract parameters from other
-collections, tf.GraphKeys.TRAINABLE_VARIABLES should be replaced accordingly.
-
-@subsection tensorflow_how_to How to use the script
-
-Install tensorflow and numpy.
-
-Download the pre-trained tensorflow model.
-
-Run tensorflow_data_extractor.py with
-
-        python tensorflow_data_extractor -m <path_to_binary_checkpoint_file> -n <path_to_metagraph_file>
-
-For example, to extract the data from pre-trained tensorflow Alex model to binary files:
-
-        python tensorflow_data_extractor -m /path/to/bvlc_alexnet -n /path/to/bvlc_alexnet.meta
-
-Or for binary checkpoint files before Tensorflow 0.11:
-
-        python tensorflow_data_extractor -m /path/to/bvlc_alexnet.ckpt -n /path/to/bvlc_alexnet.meta
-
-@note with versions >= Tensorflow 0.11 only model name is passed to the -m option
-
-The script has been tested with Tensorflow 1.2, 1.3 on Python 2.7.6 and Python 3.4.3.
-
-@subsection tensorflow_result What is the expected output from the script
-
-If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates
- *.npy files containing the weights and biases of each layer.
-
-The arm_compute::utils::load_trained_data shows how one could load
-the weights and biases into tensor from the .npy file by the help of Accessor.
-
-@section tf_frozen_model_extractor Extract data from pre-trained frozen tensorflow model
-
-The script tf_frozen_model_extractor.py extracts trainable parameters (e.g. values of weights and biases) from a
-frozen trained Tensorflow model.
-
-@subsection tensorflow_frozen_how_to How to use the script
-
-Install Tensorflow and NumPy.
-
-Download the pre-trained Tensorflow model and freeze the model using the architecture and the checkpoint file.
-
-Run tf_frozen_model_extractor.py with
-
-        python tf_frozen_model_extractor -m <path_to_frozen_pb_model_file> -d <path_to_store_parameters>
-
-For example, to extract the data from pre-trained Tensorflow model to binary files:
-
-        python tf_frozen_model_extractor -m /path/to/inceptionv3.pb -d ./data
-
-@subsection tensorflow_frozen_result What is the expected output from the script
-
-If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates
- *.npy files containing the weights and biases of each layer.
-
-The arm_compute::utils::load_trained_data shows how one could load
-the weights and biases into tensor from the .npy file by the help of Accessor.
-
-@section validate_examples Validating examples
-
-Compute Library provides a list of graph examples that are used in the context of integration and performance testing.
-The provenance of each model is part of its documentation and no structural or data alterations have been applied to any
-of them unless explicitly specified otherwise in the documentation.
-
-Using one of the provided scripts will generate files containing the trainable parameters.
-
-You can validate a given graph example on a list of inputs by running:
-
-    LD_LIBRARY_PATH=lib ./<graph_example> --validation-range='<validation_range>' --validation-file='<validation_file>' --validation-path='/path/to/test/images/' --data='/path/to/weights/'
-
-e.g:
-
-LD_LIBRARY_PATH=lib ./bin/graph_alexnet --target=CL --layout=NHWC --type=F32 --threads=4 --validation-range='16666,24998' --validation-file='val.txt' --validation-path='images/' --data='data/'
-
-where:
-    validation file is a plain document containing a list of images along with their expected label value.
-    e.g:
-
-        val_00000001.JPEG 65
-        val_00000002.JPEG 970
-        val_00000003.JPEG 230
-        val_00000004.JPEG 809
-        val_00000005.JPEG 516
-
-    --validation-range is the index range of the images within the validation file you want to check:
-    e.g:
-
-       --validation-range='100,200' will validate 100 images starting from 100th one in the validation file.
-
-    This can be useful when parallelizing the validation process is needed.
-*/
-}
diff --git a/docs/ComputeLibrary.dir b/docs/ComputeLibrary.dir
index e08f05eb2d..ab9dfc1b93 100644
--- a/docs/ComputeLibrary.dir
+++ b/docs/ComputeLibrary.dir
@@ -1,8 +1,12 @@
 //
-// Copyright © 2020 Arm Ltd. All rights reserved.
+// Copyright © 2020,2022 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
+// The following files are omitted due to technical limitations:
+// Directories : data, include, python
+// Files : LICENSE, README.md, SConscript, SConstruct, Security.md, filelist.json, filedefs.json
+
 /** @file Android.bp
  *  @brief Generation script for building AndroidNN driver.
  */
@@ -194,14 +198,6 @@
  *  @brief Utility scripts.
  */
 
-/** @file scripts/caffe_data_extractor.py
- *  @brief Basic script to export weights from Caffe to npy files.
- */
-
-/** @file scripts/tensorflow_data_extractor.py
- *  @brief Basic script to export weights from TensorFlow to npy files.
- */
-
 /** @dir src
  *  @brief Source code implementing all the arm_compute headers.
  */
@@ -230,7 +226,7 @@
  *  @brief Scalar operations
  */
 
-/** @dir src/core/gpu/cl/kernels/gemm
+/** @dir src/gpu/cl/kernels/gemm
  *  @brief Folder containing all the configuration files for GEMM
  */
 
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 27e28618b9..186f66c086 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.15
+# Doxyfile 1.8.17
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -364,7 +364,7 @@ SUBGROUPING            = YES
 
 # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
 # are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# instead of on a separate page (for HTML and Manual pages) or section (for LaTeX
 # and RTF).
 #
 # Note that this feature does not work in combination with
@@ -378,7 +378,7 @@ INLINE_GROUPED_CLASSES = NO
 # the documentation of the scope in which they are defined (i.e. file,
 # namespace, or group documentation), provided this scope is documented. If set
 # to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
+# Manual pages) or section (for LaTeX and RTF).
 # The default value is: NO.
 
 INLINE_SIMPLE_STRUCTS  = NO
@@ -773,6 +773,7 @@ INPUT                  = ./docs/user_guide/introduction.dox \
                          ./docs/user_guide/library.dox \
                          ./docs/user_guide/data_type.dox \
                          ./docs/user_guide/data_layout.dox \
+                         ./docs/user_guide/conv2d_heuristic.dox \
                          ./docs/user_guide/operator_list.dox \
                          ./docs/user_guide/tests.dox \
                          ./docs/user_guide/advanced.dox \
@@ -876,6 +877,7 @@ EXCLUDE                = ./arm_compute/core/NEON/kernels/assembly/ \
                          ./src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp \
                          ./tests/datasets/ \
                          ./tests/benchmark/fixtures/ \
+                         ./tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp \
                          ./tests/validation/fixtures/
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
@@ -1550,11 +1552,11 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.mathjax.org/mathjax/latest.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_RELPATH        = https://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
@@ -1882,16 +1884,16 @@ RTF_EXTENSIONS_FILE    =
 #RTF_SOURCE_CODE        = NO
 
 #---------------------------------------------------------------------------
-# Configuration options related to the man page output
+# Configuration options related to the manual page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# If the GENERATE_MAN tag is set to YES, doxygen will generate manual pages for
 # classes and files.
 # The default value is: NO.
 
 GENERATE_MAN           = NO
 
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# The MAN_OUTPUT tag is used to specify where the manual pages will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
 # it. A directory man3 will be created inside the directory specified by
 # MAN_OUTPUT.
@@ -1901,7 +1903,7 @@ GENERATE_MAN           = NO
 MAN_OUTPUT             = man
 
 # The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
+# manual pages. In case the manual section does not start with a number, the number
 # 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
 # optional.
 # The default value is: .3.
@@ -1910,15 +1912,15 @@ MAN_OUTPUT             = man
 MAN_EXTENSION          = .3
 
 # The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_OUTPUT in which the manual pages are placed. It defaults to man followed by
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.
 
 #MAN_SUBDIR             = 
 
 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
+# will generate one additional manual file for each entity documented in the real
+# manual page(s). These additional files only source the real manual page, but without
 # them the man command would be unable to find the correct page.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_MAN is set to YES.
@@ -2068,7 +2070,7 @@ SEARCH_INCLUDES        = YES
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
-INCLUDE_PATH           = 
+INCLUDE_PATH           = ./src/core/CL/cl_kernels/
 
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
diff --git a/docs/DoxygenLayout.xml b/docs/DoxygenLayout.xml
index b416b1cbcc..4e09e20e3d 100644
--- a/docs/DoxygenLayout.xml
+++ b/docs/DoxygenLayout.xml
@@ -3,23 +3,24 @@
   <!-- Navigation index tabs for HTML output -->
   <navindex>
     <tab type="mainpage" url="@ref introduction" title="Introduction"/>
-    <tab type="usergroup" title="User Guide" url="[none]">
+    <tab type="usergroup" title="User Guide">
         <tab type="user" url="@ref how_to_build" title="How to Build and Run Examples"/>
         <tab type="user" url="@ref architecture" title="Library Architecture"/>
         <tab type="user" url="@ref data_type_support" title="Data Type Support"/>
         <tab type="user" url="@ref data_layout_support" title="Data Layout Support"/>
+        <tab type="user" url="@ref conv2d_heuristic" title="Convolution 2D heuristic"/>
         <tab type="user" url="@ref operators_list" title="Operator List"/>
         <tab type="user" url="@ref tests" title="Validation and benchmarks"/>
         <tab type="user" url="@ref advanced" title="Advanced"/>
         <tab type="user" url="@ref versions_changelogs" title="Release Versions and Changelog"/>
         <tab type="user" url="@ref errata" title="Errata"/>
     </tab>
-    <tab type="usergroup" title="Contributor Guide" url="[none]"> 
+    <tab type="usergroup" title="Contributor Guide">
         <tab type="user" url="@ref contribution_guidelines" title="Contribution Guidelines"/>
         <tab type="user" url="@ref adding_operator" title="How to Add a New Operator"/>
         <tab type="user" url="@ref implementation_topic" title="Implementation Topics"/>
     </tab>
-    <tab type="pages" visible="no" title="" intro=""/>
+    <tab type="pages" visible="no" title="Pages" intro=""/>
     <tab type="modules" visible="yes" title="" intro=""/>
     <tab type="namespaces" visible="yes" title="">
       <tab type="namespacelist" visible="yes" title="" intro=""/>
@@ -27,15 +28,15 @@
     </tab>
     <tab type="classes" visible="yes" title="">
       <tab type="classlist" visible="yes" title="" intro=""/>
-      <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/> 
+      <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
       <tab type="hierarchy" visible="yes" title="" intro=""/>
       <tab type="classmembers" visible="yes" title="" intro=""/>
     </tab>
     <tab type="files" visible="yes" title="">
-      <tab type="filelist" visible="yes" title="" intro=""/>
+      <tab type="filelist" visible="yes" title="" intro="Below is a list of files with brief descriptions. Please note that the descriptions of some miscellaneous files and directories (e.g. data/ and include/) are omitted because of Doxygen limitations."/>
       <tab type="globals" visible="yes" title="" intro=""/>
     </tab>
-    <tab type="examples" visible="yes" title="" intro=""/>  
+    <tab type="examples" visible="yes" title="" intro=""/>
   </navindex>
 
   <!-- Layout definition for a class page -->
diff --git a/docs/contributor_guide/adding_operator.dox b/docs/contributor_guide/adding_operator.dox
index 772d4362c8..559e8e2e76 100644
--- a/docs/contributor_guide/adding_operator.dox
+++ b/docs/contributor_guide/adding_operator.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2018-2021 Arm Limited.
+/// Copyright (c) 2018-2022 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -55,13 +55,13 @@ Following are the steps involved in adding support for a new operator in Compute
 @subsection S4_1_1_add_datatypes Adding new data types
 
 Compute Library declares a few new datatypes related to its domain, kernels, and functions in the library process Tensors and Images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor. Furthermore, an image is just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc.
-All the datatype classes or structures are grouped in the core library folder arm_compute/core  like the @ref ITensor, @ref ITensorInfo (all the information of a tensor), TensorShape and simpler types are in arm_compute/core/Types.h.
+All the datatype classes or structures are grouped in the core library folder arm_compute/core  like the @ref ITensor, @ref ITensorInfo (all the information of a tensor), TensorShape and simpler types are in arm_compute/core/CoreTypes.h.
 
 If an operator handles a new datatype, it must be added to the library. While adding a new data type to the library, it's necessary to implement the function to enable printing, the to_string() method and the output stream insertion (<<) operator. Every datatype implements these two functions in utils/TypePrinter.h
 
-A quick example, in <a href="https://github.com/ARM-software/ComputeLibrary/blob/master/arm_compute/core/Types.h">Types.h</a> we add:
+A quick example, in <a href="https://github.com/ARM-software/ComputeLibrary/blob/main/arm_compute/core/CoreTypes.h">CoreTypes.h</a> we add:
 
-@snippet arm_compute/core/Types.h DataLayout enum definition
+@snippet arm_compute/core/CoreTypes.h DataLayout enum definition
 
 And for printing:
 
@@ -97,12 +97,12 @@ We must register the new layer in the respective libraries:
 
 These files contain the list of all kernels available in the corresponding Compute Library's backend, for example CLKernels:
 @code{.cpp}
-... 
+...
 #include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
 #include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
-... 
+...
 #include "src/core/CL/kernels/CLReshapeLayerKernel.h"
-... 
+...
 
 @endcode
 
@@ -119,11 +119,11 @@ Each kernel will have to implement the method:
 
 The structure of the kernel .cpp file should be similar to the next ones.
 For OpenCL:
-@snippet src/core/gpu/cl/kernels/ClReshapeKernel.cpp ClReshapeKernel Kernel
+@snippet src/gpu/cl/kernels/ClReshapeKernel.cpp ClReshapeKernel Kernel
 The run will call the function defined in the .cl file.
 
 For the Arm® Neon™ backend case:
-@snippet src/core/cpu/kernels/CpuReshapeKernel.cpp NEReshapeLayerKernel Kernel
+@snippet src/cpu/kernels/CpuReshapeKernel.cpp NEReshapeLayerKernel Kernel
 
 In the Arm® Neon™ case, there is no need to add an extra file and we implement the kernel in the same NEReshapeLayerKernel.cpp file.
 If the tests are already in place, the new kernel can be tested using the existing tests by adding the configure and run of the kernel to the compute_target() in the fixture.
diff --git a/docs/contributor_guide/contribution_guidelines.dox b/docs/contributor_guide/contribution_guidelines.dox
index f3a6def582..cbaa502635 100644
--- a/docs/contributor_guide/contribution_guidelines.dox
+++ b/docs/contributor_guide/contribution_guidelines.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019-2020 Arm Limited.
+/// Copyright (c) 2019-2023 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -35,6 +35,14 @@ The development is structured in the following way:
 - Development repository: https://review.mlplatform.org/#/admin/projects/ml/ComputeLibrary
 - Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
 
+@section S5_0_inc_lang Inclusive language guideline
+As part of the initiative to use inclusive language, certain phrases and words have been removed or replaced by more inclusive ones. Examples include, but are not limited to:
+\includedoc non_inclusive_language_examples.dox
+
+Please also follow this guideline when committing changes to Compute Library.
+It is worth mentioning that the term "master" is still used in some comments, but only in reference to external code links over which Arm has no governance.
+
+Furthermore, starting from release 22.05, the 'master' branch is no longer used; it has been replaced by 'main'. Please update your clone jobs accordingly.
 @section S5_1_coding_standards Coding standards and guidelines
 
 Best practices (as suggested by clang-tidy):
@@ -232,6 +240,18 @@ class ClassName
 };
 @endcode
 
+- In header files, use header guards that use the full file path from the project root and prepend it with "ACL_"
+
+@code{.cpp}
+// For File arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEBATCHNORMALIZATIONLAYER
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEBATCHNORMALIZATIONLAYER
+.
+.
+.
+#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEBATCHNORMALIZATIONLAYER */
+@endcode
+
 - Use quotes instead of angular brackets to include local headers. Use angular brackets for system headers.
 - Also include the module header first, then local headers, and lastly system headers. All groups should be separated by a blank line and sorted lexicographically within each group.
 - Where applicable the C++ version of system headers has to be included, e.g. cstddef instead of stddef.h.
@@ -256,6 +276,47 @@ auto c = img.ptr(); // NO: Can't tell what the type is without knowing the API.
 auto d = vdup_n_u8(0); // NO: It's not obvious what type this function returns.
 @endcode
 
+- When to use const
+
+    - Local variables: Use const as much as possible. E.g. all read-only variables should be declared as const.
+
+    - Function parameters
+
+        - Top-level const must not be used in the function declaration or definition. (Note that this applies to all types, including non-primitive types)
+          This is because for function parameters, top-level const in function declaration is always ignored by the compiler (it is meaningless).
+          Therefore we should omit top-level const to reduce visual clutter. In addition, its omission can improve API/ABI
+          stability to some extent as there is one fewer varying factor in function signatures.
+
+          Note that we could in theory allow top-level const only in the definition (where it is not ignored by the compiler) but not in the declaration.
+          But certain toolchains are known to require the declaration and definition to match exactly.
+
+        - Use low-level const (of references and pointers) as much as possible.
+@code{.cpp}
+// Primitive types
+void foo(const int a);              // NO: Top-level const must not be used in function declaration or definition
+void foo(int a);                    // OK
+// Pointer to primitive types
+void foo(int *const a);             // NO: Top-level const
+void foo(const int *const a);       // NO: Top-level const
+void foo(int *a);                   // OK. But only if foo needs to mutate the underlying object
+void foo(const int *a);             // OK but not recommended: See section above about passing primitives by value
+// Reference to primitive types
+// There's no "top-level const" for references
+void foo(int &a);                   // OK. But only if foo needs to mutate the underlying object
+void foo(const int &a);             // OK but not recommended: See section above about passing primitives by value
+
+// Custom types
+void foo(const Goo g);              // NO: Top-level const
+void foo(Goo g);                    // OK
+// Pointer to custom types
+void foo(Goo *const g);             // NO: Top-level const
+void foo(Goo *g);                   // OK. But only if foo needs to mutate the underlying object
+void foo(const Goo *g);             // OK
+// Reference to custom types
+void foo(Goo &g);                   // OK. But only if foo needs to mutate the underlying object
+void foo(const Goo &g);             // OK
+@endcode
+
 - OpenCL:
     - Use __ in front of the memory types qualifiers and kernel: __kernel, __constant, __private, __global, __local.
     - Indicate how the global workgroup size / offset / local workgroup size are being calculated.
@@ -264,7 +325,7 @@ auto d = vdup_n_u8(0); // NO: It's not obvious what type this function returns.
 
         - No '*' in front of argument names
         - [in], [out] or [in,out] *in front* of arguments
-        - Skip a line between the description and params and between params and @return (If there is a return)
+        - Skip a line between the description and params and between params and \@return (If there is a return)
         - Align params names and params descriptions (Using spaces), and with a single space between the widest column and the next one.
         - Use an upper case at the beginning of the description
 
@@ -274,6 +335,26 @@ auto d = vdup_n_u8(0); // NO: It's not obvious what type this function returns.
 
 astyle (http://astyle.sourceforge.net/) and clang-format (https://clang.llvm.org/docs/ClangFormat.html) can check and help you apply some of these rules.
 
+We also provide the Python scripts that we use in our pre-commit pipeline inside the scripts directory.
+    - format_code.py: checks Android.bp, bad style, end of file, formats doxygen, runs astyle and clang-format (assuming necessary binaries are in the path). Example invocations:
+@code{.sh}
+        python format_code.py
+        python format_code.py --error-on-diff
+        python format_code.py --files=git-diff (Default behavior in pre-commit configuration, where it checks the staged files)
+@endcode
+    - generate_build_files.py: generates build files required for CMake and Bazel builds. Example invocations:
+@code{.sh}
+        python generate_build_files.py --cmake
+        python generate_build_files.py --bazel
+@endcode
+
+Another way of running the checks is to use the `pre-commit` (https://pre-commit.com/) framework, which also has nice features such as checking for trailing spaces and large committed files.
+`pre-commit` can be installed via `pip`. After installing, run the following command in the root directory of the repository:
+
+	pre-commit install
+
+This will create the hooks that perform the formatting checks mentioned above and will automatically run just before committing to flag issues.
+
 @subsection S5_1_3_library_size_guidelines Library size: best practices and guidelines
 
 @subsubsection S5_1_3_1_template_suggestions Template suggestions
@@ -434,7 +515,7 @@ You can add this to your patch with:
 
 You are now ready to submit your patch for review:
 
-	git push acl-gerrit HEAD:refs/for/master
+	git push acl-gerrit HEAD:refs/for/main
 
 @section S5_3_code_review Patch acceptance and code review
 
@@ -444,7 +525,7 @@ Once a patch is uploaded for review, there is a pre-commit test that runs on a J
 - get a "+1 Comments-Addressed", in case of comments from reviewers the committer has to address them all. A comment is considered addressed when the first line of the reply contains the word "Done"
 - get a "+2" from a reviewer, that means the patch has the final approval
 
-At the moment, the Jenkins server is not publicly accessible and for security reasons patches submitted by non-whitelisted committers do not trigger the pre-commit tests. For this reason, one of the maintainers has to manually trigger the job.
+At the moment, the Jenkins server is not publicly accessible and for security reasons patches submitted by non-allowlisted committers do not trigger the pre-commit tests. For this reason, one of the maintainers has to manually trigger the job.
 
 If the pre-commit test fails, the Jenkins job will post a comment on Gerrit with the details about the failure so that the committer will be able to reproduce the error and fix the issue, if any (sometimes there can be infrastructure issues, a test platform disconnecting for example, where the job needs to be retriggered).
 
diff --git a/docs/contributor_guide/implementation_topics.dox b/docs/contributor_guide/implementation_topics.dox
index 4afaa6d6a1..6ca78f98e7 100644
--- a/docs/contributor_guide/implementation_topics.dox
+++ b/docs/contributor_guide/implementation_topics.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2021, 2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -25,6 +25,52 @@ namespace arm_compute
 {
 /** @page implementation_topic Implementation Topics
 
+@section implementation_topic_assembly_kernels Assembly kernels
+
+Arm Compute Library contains a collection of highly optimized assembly kernels for Arm® A profile architecture. At runtime the
+library selects the best kernel based on the CPU detected. For example if the CPU supports the dot product instruction
+the library will choose a GEMM kernel which uses the dot product instruction. There are various kernels using Neon™ and
+architecture extensions like FP16, Dot product, SVE, SVE2 and SME.
+
+For example, some assembly kernels are located in the folders:
+- src/core/NEON/kernels/arm_gemm/kernels
+- src/core/NEON/kernels/arm_gemm/pooling
+- src/core/NEON/kernels/arm_conv/depthwise
+
+
+The assembly kernels are written using assembly mnemonics and the .inst directive, which inserts the machine code directly into the output.
+
+Below you can see a code block from one of the kernels in the library which uses the .inst directive to generate the sdot instruction.
+This code can be found in the kernel @ref src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+
+@code{.cpp}
+".inst 0x4f80eb10  // sdot v16.4s, v24.16b, v0.4b[2]\n"
+".inst 0x4f81eb14  // sdot v20.4s, v24.16b, v1.4b[2]\n"
+" ldr d24, [x12, #0xf0]\n"
+" ldr x20, [x12, #0xf8]\n"
+" .inst 0x4f80ebd1  // sdot v17.4s, v30.16b, v0.4b[2]\n"
+" .inst 0x4f81ebd5  // sdot v21.4s, v30.16b, v1.4b[2]\n"
+" mov v27.d[1], x23\n"
+" .inst 0x4f80ebb2  // sdot v18.4s, v29.16b, v0.4b[2]\n"
+" mov v26.d[1], x22\n"
+" .inst 0x4f81ebb6  // sdot v22.4s, v29.16b, v1.4b[2]\n"
+" mov v25.d[1], x21\n"
+" .inst 0x4f80eb93  // sdot v19.4s, v28.16b, v0.4b[2]\n"
+" mov v24.d[1], x20\n"
+" .inst 0x4f81eb97  // sdot v23.4s, v28.16b, v1.4b[2]\n"
+" add x9, x9, #0x10\n"
+" add x28, x28, #0x10\n"
+" add x12, x12, #0x100\n"
+" .inst 0x4fa0eb70  // sdot v16.4s, v27.16b, v0.4b[3]\n"
+" .inst 0x4fa1eb74  // sdot v20.4s, v27.16b, v1.4b[3]\n"
+" .inst 0x4fa0eb51  // sdot v17.4s, v26.16b, v0.4b[3]\n"
+" .inst 0x4fa1eb55  // sdot v21.4s, v26.16b, v1.4b[3]\n"
+@endcode
+
+Note that every occurrence of .inst is accompanied by a comment with the original instruction mnemonic for readability purposes.
+
+The reason for using the opcodes instead of the mnemonic is that this approach will work on any toolchain, including the ones without support for the dot product mnemonic. The .inst directive is also used to generate many other instructions, ensuring the code will compile on older toolchains that do not support them.
+
 @section implementation_topic_windows Windows
 
 A @ref Window represents a workload to execute, it can handle up to @ref Coordinates::num_max_dimensions dimensions.
@@ -140,4 +186,4 @@ This is a very basic implementation which was originally used in the Arm® Neon�
 All OpenCL kernels used by the library are built and stored in @ref CLKernelLibrary.
 If the library is compiled with embed_kernels=0 the application can set the path to the OpenCL kernels by calling @ref CLKernelLibrary::init(), by default the path is set to "./cl_kernels"
 */
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/docs/contributor_guide/non_inclusive_language_examples.dox b/docs/contributor_guide/non_inclusive_language_examples.dox
new file mode 100644
index 0000000000..addfdd34dd
--- /dev/null
+++ b/docs/contributor_guide/non_inclusive_language_examples.dox
@@ -0,0 +1,4 @@
+ - master/slave
+ - black/white
+ - he/she, him/her, his/hers
+   - When referring to a person where gender is irrelevant or unknown, kindly use they, them, theirs, or a person’s preferred pronoun.
+\ No newline at end of file
diff --git a/docs/user_guide/advanced.dox b/docs/user_guide/advanced.dox
index 86ee2ce756..2b9e0d02f7 100644
--- a/docs/user_guide/advanced.dox
+++ b/docs/user_guide/advanced.dox
@@ -110,5 +110,30 @@ After the first run, the CLTuner's results can be exported to a file using the m
 This file can be also imported using the method "load_from_file("results.csv")".
 - tuner.load_from_file("results.csv");
 
+@section security_concerns Security Concerns
+Here are some security concerns that may affect Compute Library.
+
+@subsection security_concerns_same_uid A process running under the same uid could read another process memory
+
+Processes running under the same user ID (UID) may be able to read each other's memory and running state. Hence, this can
+lead to information disclosure and sensitive data can be leaked, such as the weights of the model currently executing.
+This mainly affects Linux systems and it's the responsibility of the system owner to make processes secure against
+this vulnerability. Moreover, the YAMA security kernel module can be used to detect and stop such an attack attempt;
+it can be selected at kernel compile time with CONFIG_SECURITY_YAMA and configured at runtime by changing the
+ptrace_scope in /proc/sys/kernel/yama.
+
+Please refer to: https://www.kernel.org/doc/html/v4.15/admin-guide/LSM/Yama.html for more information in this regard.
+
+@subsection security_concerns_altered_files Malicious users could alter Compute Library related files
+
+Extra care must be taken in order to reduce the possibility of a user altering sensitive files. CLTuner files
+should be protected from arbitrary writes since such writes can cause Compute Library to crash or exhaust the system's resources.
+
+@subsection security_concerns_various Various concerns
+
+Sensitive applications that use Compute Library should consider possible attack vectors such as shared library hooking,
+information leakage from the underlying OpenCL driver or from a previous execution, and running arbitrary networks that consume
+all the available resources on the system, leading to denial of service.
+
 */
 } // namespace
 \ No newline at end of file
diff --git a/docs/user_guide/conv2d_heuristic.dox b/docs/user_guide/conv2d_heuristic.dox
new file mode 100644
index 0000000000..edd24a3d36
--- /dev/null
+++ b/docs/user_guide/conv2d_heuristic.dox
@@ -0,0 +1,89 @@
+///
+/// Copyright (c) 2023 Arm Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+namespace arm_compute
+{
+/**
+@page conv2d_heuristic Convolution 2D heuristic
+
+@section conv2d_heuristic_algorithms_used Convolution 2D heuristic: algorithm selection
+
+The convolution 2D (in short, conv2D) is certainly one of the most compute intensive and performance critical operators in ML workloads.
+This operator can be implemented with different algorithms, which differ in terms of accuracy, kernel size support, and additional memory required.
+Unfortunately, there is no single algorithm that can be used in all scenarios to achieve the best performance.
+Therefore, the Arm Compute Library integrates a heuristic within the conv2d operators to select the most efficient algorithm, depending on input and kernel shapes and desired level of accuracy.
+The heuristic depends on the target backend (either NEON™ for Arm® CPUs or OpenCL for Arm® GPUs) and the following subsections will provide the main details behind the selection of the algorithm.
+
+⚠ Attention: The heuristics presented in the following subsections will only refer to the NHWC data layout, which is the optimal and recommended layout for the Arm Compute Library.
+
+@subsection conv2d_heuristic_on_cpu Convolution 2D heuristic: Arm® Cortex®-based CPUs
+
+The conv2d heuristic for Arm® Cortex®-based CPUs is inside the get_convolution_method() method in the CpuConv2d function.
+The algorithms used in the get_convolution_method() function are the following:
+- Direct-Conv2D
+- Im2Col+GeMM-based
+- Indirect-GeMM (a.k.a. GEMMCONV2D)
+- GeMM
+- Winograd
+
+⚠ Attention: Winograd only works with floating-point data types (F32, F16)
+
+The heuristic first checks less frequent cases that we may have in ML workloads for edge devices. These cases are the following:
+-# Non unit dilation: We call Im2Col+GeMM
+-# Large input and kernel shapes: We call Direct-Conv2D because it is the only algorithm that does not require additional temporary memory
+-# Small Input-Feature-Maps (IFM): In this scenario, we have found that the GeMM implementation is generally the most efficient algorithm compared to Winograd and Indirect-GeMM
+
+If we have a more frequent case, such as unit dilations or larger IFM, we evaluate the following conditions instead:
+-# Unit kernel size (1x1): In this scenario, the conv2d operation corresponds to a matrix multiplication and we call GeMM.
+-# Winograd. Winograd only works with unit strides and supports a limited number of kernel sizes, such as 3x3, 3x1, 1x3, 5x1, 1x5 and 5x5
+-# Indirect-GeMM: It should be used in all cases except when the kernel size is 1x1 or when the IFM is small
+
+If the preceding cases are not met, we will fall-back to the Im2Col+GeMM-based algorithm.
+
+@subsection conv2d_heuristic_on_gpu Convolution 2D heuristic: Arm® Mali™-based GPUs
+
+The conv2d heuristic for Arm® Mali™-based GPUs is inside the get_convolution_method() method in the ClConv2d function.
+
+The algorithms used in the get_convolution_method() function are the following:
+- Direct-Conv2D
+- Im2Col+GeMM-based
+- Indirect-GeMM
+- GeMM
+- Winograd
+
+⚠ Attention: Winograd only works with floating-point data types (F32, F16)
+
+The heuristic first checks less frequent cases that we may have in ML workloads for edge devices. These cases are the following:
+-# Non unit dilation: We call Im2Col+GeMM
+-# Large input and kernel shapes: We call Direct-Conv2D because it is the only algorithm that does not require additional temporary memory
+
+In all the other cases, the GPU heuristic evaluates the suitability of Winograd and Direct-Conv2D/Indirect-Conv2D.
+In particular, Winograd is adopted when the convolution parameters (kernel size and strides) are supported by the algorithm and when the IFM is not small (for example, greater than 8).
+The conditions for using the Direct-Conv2D algorithms are several and we recommend you look at the heuristic directly.
+In general, the Direct-Conv2D operator is used in almost all cases where kernel size is not 1x1.
+The Indirect-GeMM algorithm is used as an alternative to Direct-Conv2D only for Arm® Mali™-G77 GPU.
+If neither Winograd nor Direct-Conv2D can be used, we will fall-back to either GeMM (when the kernel size is 1x1) or the Im2Col+GeMM-based algorithm.
+
+*/
+} // namespace
diff --git a/docs/user_guide/data_layout.dox b/docs/user_guide/data_layout.dox
index 97d3ea6262..711b85f08c 100644
--- a/docs/user_guide/data_layout.dox
+++ b/docs/user_guide/data_layout.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2021 Arm Limited.
+/// Copyright (c) 2021-2022 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -29,13 +29,36 @@ namespace arm_compute
 
 @section data_layout_support_supported_data_layout Supported Data Layouts
 
-Compute Library supports the following data layouts and
-the right-most letter represents the fastest changing dimension:
+With regard to convolution layers, Compute Library supports the following data layouts for input and output tensors:
 
 - NHWC: The native layout of Compute Library that delivers the best performance where channels are in the fastest changing dimension
 - NCHW: Legacy layout where width is in the fastest changing dimension
+- NDHWC: New data layout for supporting 3D operators
 
-, where N = batch, C = channel, H = height, W = width.
+, where N = batch, C = channel, H = height, W = width, D = depth.
+
+Note: The right-most letter represents the fastest changing dimension, which is the "lower dimension".
+The corresponding @ref TensorShape for each of the data layouts would be initialized as:
+
+- NHWC: TensorShape(C, W, H, N)
+- NCHW: TensorShape(W, H, C, N)
+- NDHWC: TensorShape(C, W, H, D, N)
+
+For 2d Conv, the weight / filter tensors are arranged in 4 dimensions: Height (H), Width (W), Input channel (I), Output channel (O).
+For 3d Conv, the additional Depth dimension means exactly the same as the Depth in the input / output layout.
+
+The layout of weight tensors changes with that of the input / output tensors, and the dimensions can be mapped as:
+
+- Weight Height -> Height
+- Weight Width -> Width
+- Weight Input channel -> Channel
+- Weight Output channel -> Batch
+
+Therefore, the corresponding weight layouts for each input / output layout are:
+
+- (input/output tensor) NHWC: (weight tensor) OHWI
+- (input/output tensor) NCHW: (weight tensor) OIHW
+- (input/output tensor) NDHWC: (weight tensor) ODHWI
 
 */
 } // namespace
diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox
index 635358bf92..056e45a432 100644
--- a/docs/user_guide/errata.dox
+++ b/docs/user_guide/errata.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019-2021 Arm Limited.
+/// Copyright (c) 2019-2023 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -30,22 +30,67 @@ namespace arm_compute
 
 @section S7_1_errata Errata
 
-- Experimented performance regressions for some networks on OpenCL when using Arm® Mali™ DDK r8p0
-    - Versions Affected: >= v21.05
+- (COMPMID-6493) Crash when running Arm Compute Library compiled for SVE2 on a computer that supports SVE only.
+    - Versions: >= v21.02 && <=v23.08
+    - OSs: Linux, Android.
+    - Conditions:
+        - Compile the latest Arm Compute Library for SVE2 (arch=armv8.6-a-sve2).
+        - multi_isa = 0
+        - Device with SVE but without SVE2 support.
+    - Result:
+        - Crash due to illegal instruction.
+        - To run SVE only, build with arch="armv8.2-a-sve", arch="armv8.6-a-sve", or with multi_isa=1.
+
+- (COMPMID-6404) Under certain conditions, CLTile may produce an incorrect result.
+    - Versions: >= v19.02 && < v23.08
+    - OSs: Linux, Android.
+    - Conditions:
+        - The size of the lowest dimension of the input tensor is greater than 16 bytes.
+        - The size of the lowest dimension of the input tensor is not a multiple of 16.
+    - Result:
+        - Incorrect result is produced.
+
+- (COMPMID-6271) Under certain conditions, CLArgMinMaxLayer validation tests may fail
+    - Versions Affected: >= v20.02 && < v23.08
+    - OSs Affected: Linux
+    - Conditions:
+        - Backend: OpenCL
+        - Axis == 0
+    - Result:
+        - Sporadic mismatches only on certain devices
+
+- (COMPMID-5324) Issue identified with direct and depthwise convolutions for certain Arm® Mali™ DDK versions.
+    - Versions Affected: < v22.08
+    - Conditions:
+        - Arm® Mali™ DDK Versions : >= r23p0 && <= r38p0
+        - Mali™ GPUs: Bifrost GPU family with the exception of G71
+        - Backend: OpenCL
+        - Build options Include : "cl-fast-relaxed-math"
+    - Result: Reduced accuracy issue, while using direct and depthwise convolutions fused with LU_BOUNDED_RELU activation.
+
+- (COMPMID-5134) An issue has been identified when running the graph_deepspeech_v0_4_1 graph example.
+    - Versions Affected: >= v21.08
+    - Conditions:
+        - Data type input: F32
+        - Backend: OpenCL
+    - Result: The execution of the graph_deepspeech_v0_4_1 could fail on OpenCL backend for systems with a small RAM. The issue is due to the extra temporary memory required to reshape the network weights
+
+- (COMPMID-4013) Performance regressions observed for some networks on OpenCL when using Arm® Mali™ DDK r8p0
+    - Versions Affected: v21.05
     - OSs Affected: All
     - Conditions:
         - Arm® Mali™ DDK r8p0
 
-- Under certain conditions, CLFullyConnectedLayer quantized tests may fail due to an issue in the test framework.
-    - Versions Affected: 21.02
+- (COMPMID-5146) Under certain conditions, CLFullyConnectedLayer quantized tests may fail due to an issue in the test framework.
+    - Versions Affected: v21.02
     - OSs Affected: Linux
     - Conditions:
         - armv7a architecture
         - release mode
         - asserts enabled
 
-- Performance regression in Convolution Layer OpenCL backend on Mali™ G77 when QSYMM8_PER_CHANNEL is used as weights' data type.
-    - Versions Affected: >= v20.11
+- (COMPMID-4367) Performance regression in Convolution Layer OpenCL backend on Mali™ G77 when QSYMM8_PER_CHANNEL is used as weights' data type.
+    - Versions Affected: >= v20.11 && < v21.08
     - OSs Affected: All
     - Conditions:
         - Mali™ G77
@@ -53,34 +98,34 @@ namespace arm_compute
         - OpenCL backend
         - Convolution Layer uses QSYMM8_PER_CHANNEL as the data type of its weight
 
-- A wrong test configuration has been found in CLGEMMMatrixMultiplyReshapedOnlyRHS set of tests.
-    - Versions Affected: >= 20.11
+- (COMPMID-4306) A wrong test configuration has been found in CLGEMMMatrixMultiplyReshapedOnlyRHS set of tests.
+    - Versions Affected: >= v20.11 && < v21.05
     - Conditions:
         - Data type input: F32/F16
         - Fused bounded relu activation with coefficient 'a' being negative
 
-- Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail.
+- (COMPMID-5135) Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail.
     - Versions Affected: >= v20.08
     - Conditions:
         - The validation suite has to run in nightly mode and execute 40k+ test cases before the test mentioned above
 
-- Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled.
+- (COMPMID-5136) Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled.
     - Versions Affected: >= v19.11
     - OSs Affected: Linux
     - Conditions:
         - Arm® Mali™ DDK r1p0 - r8p0, and
         - Linux kernel >= 4.4
 
-- On Android with arm64-v8a/arm64-v8.2-a architecture, Arm® Neon™ validation tests can fail when compiled using Android Ndk
+- (COMPMID-5137) On Android with armv8a/armv8.2-a architecture, Arm® Neon™ validation tests can fail when compiled using Android Ndk
   >= r18b in debug mode (https://github.com/android/ndk/issues/1135).
     - Versions Affected: >= v19.11
     - OSs Affected: Android
     - Conditions:
-        - arm64-v8a/arm64-v8.2-a architecture, and
+        - armv8a/armv8.2-a architecture, and
         - Compiled using Android NDK >= r18b in debug mode.
 
-- An issue has been identified with CLCast.
-    - Versions Affected: >= 18.11
+- (COMPMID-4288) An issue has been identified with CLCast.
+    - Versions Affected: >= v18.11 && < v21.05
     - Conditions:
         - Data type input: F32
         - Data type output: All integer types
diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox
index 1766199eb4..0b8a23b368 100644
--- a/docs/user_guide/how_to_build_and_run_examples.dox
+++ b/docs/user_guide/how_to_build_and_run_examples.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -30,203 +30,7 @@ namespace arm_compute
 @section S1_1_build_options Build options
 
 scons 2.3 or above is required to build the library.
-To see the build options available simply run ```scons -h```:
-
-        debug: Debug (yes|no)
-            default: False
-
-        asserts: Enable asserts (this flag is forced to 1 for debug=1) (yes|no)
-            default: False
-
-        logging: Logging (this flag is forced to 1 for debug=1) (yes|no)
-            default: False
-
-        arch: Target Architecture (armv7a|arm64-v8a|arm64-v8.2-a|arm64-v8.2-a-sve|arm64-v8.2-a-sve2|x86_32|x86_64|armv8a|armv8.2-a|armv8.2-a-sve|armv8.6-a|armv8.6-a-sve|armv8.6-a-sve2|armv8r64|x86)
-            default: armv7a
-
-        estate: Execution State (auto|32|64)
-            default: auto
-
-        os: Target OS (linux|android|macos|tizen|bare_metal)
-            default: linux
-
-        build: Build type (native|cross_compile|embed_only)
-            default: cross_compile
-
-        examples: Build example programs (yes|no)
-            default: True
-
-        gemm_tuner: Build gemm_tuner programs (yes|no)
-            default: True
-
-        Werror: Enable/disable the -Werror compilation flag (yes|no)
-            default: True
-
-        standalone: Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute (yes|no)
-            default: False
-
-        opencl: Enable OpenCL support (yes|no)
-            default: True
-
-        neon: Enable Arm® Neon™ support (yes|no)
-            default: False
-
-        embed_kernels: Embed OpenCL kernels in library binary (yes|no)
-            default: True
-
-        compress_kernels: Compress embedded OpenCL kernels in library binary. Note embed_kernels should be enabled as well (yes|no)
-            default: False
-
-        set_soname: Set the library's soname and shlibversion (requires SCons 2.4 or above) (yes|no)
-            default: False
-
-        openmp: Enable OpenMP backend (yes|no)
-            default: False
-
-        cppthreads: Enable C++11 threads backend (yes|no)
-            default: True
-
-        build_dir: Specify sub-folder for the build ( /path/to/build_dir )
-            default: .
-
-        install_dir: Specify sub-folder for the install ( /path/to/install_dir )
-            default:
-
-        exceptions: Enable/disable C++ exception support (yes|no)
-            default: True
-
-        linker_script: Use an external linker script ( /path/to/linker_script )
-            default:
-
-        custom_options: Custom options that can be used to turn on/off features
-            (all|none|comma-separated list of names)
-            allowed names: disable_mmla_fp
-            default: none
-
-        data_type_support: Enable a list of data types to support
-            (all|none|comma-separated list of names)
-            allowed names: qasymm8 qasymm8_signed qsymm16 fp16 fp32
-            default: all
-
-        toolchain_prefix: Override the toolchain prefix
-            default:
-
-        compiler_prefix: Override the compiler prefix
-            default:
-
-        extra_cxx_flags: Extra CXX flags to be appended to the build command
-            default:
-
-        extra_link_flags: Extra LD flags to be appended to the build command
-            default:
-
-        compiler_cache: Command to prefix to the C and C++ compiler (e.g ccache)
-            default:
-
-        specs_file: Specs file to use
-            default: rdimon.specs
-
-        benchmark_examples: Build benchmark examples programs (yes|no)
-            default: False
-
-        validate_examples: Build validate examples programs (yes|no)
-            default: False
-
-        reference_openmp: Build reference validation with openmp (yes|no)
-            default: True
-
-        validation_tests: Build validation test programs (yes|no)
-            default: False
-
-        benchmark_tests: Build benchmark test programs (yes|no)
-            default: False
-
-        test_filter: Pattern to specify the tests' filenames to be compiled
-            default: *.cpp
-
-        pmu: Enable PMU counters (yes|no)
-            default: False
-
-        mali: Enable Arm® Mali™ hardware counters (yes|no)
-            default: False
-
-        external_tests_dir: Add examples, benchmarks and tests to the tests suite from an external path ( /path/to/external_tests_dir )
-            default:
-
-@b debug / @b asserts:
- - With debug=1 asserts are enabled, and the library is built with symbols and no optimisations enabled.
- - With debug=0 and asserts=1: Optimisations are enabled and symbols are removed, however all the asserts are still present (This is about 20% slower than the release build)
- - With debug=0 and asserts=0: All optimisations are enable and no validation is performed, if the application misuses the library it is likely to result in a crash. (Only use this mode once you are sure your application is working as expected).
-
-@b arch: The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.
-
-@b os: Choose the operating system you are targeting: Linux, Android or bare metal.
-@note bare metal can only be used for Arm® Neon™ (not OpenCL), only static libraries get built and Neon™'s multi-threading support is disabled.
-
-@b build: you can either build directly on your device (native) or cross compile from your desktop machine (cross-compile). In both cases make sure the compiler is available in your path.
-
-@note If you want to natively compile for 32bit on a 64bit Arm device running a 64bit OS then you will have to use cross-compile too.
-
-There is also an 'embed_only' option which will generate all the .embed files for the OpenCL kernels. This might be useful if using a different build system to compile the library.
-
-In addition the option 'compress_kernels' will compress the embedded OpenCL kernel files using zlib and inject them in the library. This is useful for reducing the binary size. Note, this option is only available for Android when 'embed_kernels' is enabled.
-
-@b Werror: If you are compiling using the same toolchains as the ones used in this guide then there shouldn't be any warning and therefore you should be able to keep Werror=1. If with a different compiler version the library fails to build because of warnings interpreted as errors then, if you are sure the warnings are not important, you might want to try to build with Werror=0 (But please do report the issue on Github).
-
-@b opencl / @b neon: Choose which SIMD technology you want to target. (Neon™ for Arm® Cortex®-A CPUs or OpenCL for Arm® Mali™ GPUs)
-
-@b embed_kernels: For OpenCL only: set embed_kernels=1 if you want the OpenCL kernels to be built in the library's binaries instead of being read from separate ".cl" / ".cs" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL kernel files by calling CLKernelLibrary::init(). By default the path is set to "./cl_kernels".
-
-@b set_soname: Do you want to build the versioned version of the library ?
-
-If enabled the library will contain a SONAME and SHLIBVERSION and some symlinks will automatically be created between the objects.
-Example:
-  libarm_compute_core.so -> libarm_compute_core.so.1.0.0
-  libarm_compute_core.so.1 -> libarm_compute_core.so.1.0.0
-  libarm_compute_core.so.1.0.0
-
-@note This options is disabled by default as it requires SCons version 2.4 or above.
-
-@b extra_cxx_flags: Custom CXX flags which will be appended to the end of the build command.
-
-@b build_dir: Build the library in a subfolder of the "build" folder. (Allows to build several configurations in parallel).
-
-@b examples: Build or not the examples
-
-@b validation_tests: Enable the build of the validation suite.
-
-@b benchmark_tests: Enable the build of the benchmark tests
-
-@b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)
-
-@b mali: Enable the collection of Arm® Mali™ hardware counters to measure execution time in benchmark tests. (Your device needs to have a Arm® Mali™ driver that supports it)
-
-@b openmp Build in the OpenMP scheduler for Neon™.
-
-@note Only works when building with g++ not clang++
-
-@b cppthreads Build in the C++11 scheduler for Neon™.
-
-@sa Scheduler::set
-
-@b external_tests_dir Add examples, benchmarks and tests to the tests suite from an external path ( /path/to/external_tests_dir )
-
-In order to use this option, the external tests directory must have the following structure:
-
-    EXTERNAL_TESTS_DIR:
-    └── tests
-        ├── benchmark
-        │   ├── CL
-        │   ├── datasets
-        │   ├── fixtures
-        │   └── Neon
-        └── validation
-            ├── CL
-            ├── datasets
-            ├── fixtures
-            └── Neon
-
-Then, build the library with `external_tests_dir=<PATH_TO_EXTERNAL_TESTS_DIR>`.
+To see the build options available simply run ```scons -h```
 
 @section S1_2_linux Building for Linux
 
@@ -243,11 +47,11 @@ To cross-compile the library in debug mode, with Arm® Neon™ only support, for
 
 To cross-compile the library in asserts mode, with OpenCL only support, for Linux 64bit:
 
-	scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
+	scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=linux arch=armv8a
 
 You can also compile the library natively on an Arm device by using <b>build=native</b>:
 
-	scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=arm64-v8a build=native
+	scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv8a build=native
 	scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a build=native
 
 @note g++ for Arm is mono-arch, therefore if you want to compile for Linux 32bit on a Linux 64bit platform you will have to use a cross compiler.
@@ -272,21 +76,21 @@ The examples get automatically built by scons as part of the build process of th
 
 To cross compile a Arm® Neon™ example for Linux 32bit:
 
-	arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_cnn
+	arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o neon_cnn
 
 To cross compile a Arm® Neon™ example for Linux 64bit:
 
-	aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_cnn
+	aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o neon_cnn
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
 
 To cross compile an OpenCL example for Linux 32bit:
 
-	arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+	arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
 
 To cross compile an OpenCL example for Linux 64bit:
 
-	aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+	aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
 
@@ -294,45 +98,45 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n
 
 i.e. to cross compile the "graph_lenet" example for Linux 32bit:
 
-	arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+	arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
 
 i.e. to cross compile the "graph_lenet" example for Linux 64bit:
 
-	aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+	aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
 
-@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
+@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute
 
 To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 32bit:
 
-	g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_cnn
+	g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -o neon_cnn
 
 To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 64bit:
 
-	g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_cnn
+	g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o neon_cnn
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option)
 
 To compile natively (i.e directly on an Arm device) for OpenCL for Linux 32bit or Linux 64bit:
 
-	g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+	g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
 
 To compile natively the examples with the Graph API, such as graph_lenet.cpp, you need to link the examples against arm_compute_graph.so too.
 
 i.e. to natively compile the "graph_lenet" example for Linux 32bit:
 
-	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
 
 i.e. to natively compile the "graph_lenet" example for Linux 64bit:
 
-	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option)
 
-@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
+@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute
 
-@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-arm64-v8a-neon-cl-asserts/)
+@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-armv8a-neon-cl-asserts/)
 @note You might need to export the path to OpenCL library as well in your LD_LIBRARY_PATH if Compute Library was built with OpenCL enabled.
 
 To run the built executable simply run:
@@ -362,27 +166,79 @@ In order to build for SVE or SVE2 you need a compiler that supports them. You ca
 
 An example build command with SVE is:
 
-        scons arch=arm64-v8.2-a-sve os=linux build_dir=arm64 -j55 standalone=0 opencl=0 openmp=0 validation_tests=1 neon=1 cppthreads=1 toolchain_prefix=aarch64-none-linux-gnu-
+        scons arch=armv8.2-a-sve os=linux build_dir=arm64 -j55 standalone=0 opencl=0 openmp=0 validation_tests=1 neon=1 cppthreads=1 toolchain_prefix=aarch64-none-linux-gnu-
+
+@subsection S1_2_4_sme Build for SME2
+
+In order to build for SME2 you need to use a compiler that supports SVE2 and enable SVE2 in the build as well.
+
+@note You need to indicate the toolchain using the scons "toolchain_prefix" parameter.
+
+An example build command with SME2 is:
+
+        scons arch=armv8.6-a-sve2-sme2 os=linux build_dir=arm64 -j55 standalone=0 opencl=0 openmp=0 validation_tests=1 neon=1 cppthreads=1 toolchain_prefix=aarch64-none-linux-gnu-
+
+@subsection S1_2_5_clang_build_linux Building with LLVM+Clang Natively on Linux
+
+The library can be built with LLVM+Clang by specifying CC and CXX environment variables appropriately as below. The **minimum** supported clang version is 11, as LLVM 11 introduces SVE/SVE2 VLA intrinsics: https://developer.arm.com/Tools%20and%20Software/LLVM%20Toolchain#Supported-Devices.
+
+	CC=clang CXX=clang++ <build command>
+
+Or, if the environment has multiple clang versions:
+
+	CC=clang-16 CXX=clang++-16
+
+Examples for different build tools look like below.
+
+(experimental) CMake:
+
+	mkdir build
+	cd build
+	CC=clang CXX=clang++ cmake .. -DCMAKE_BUILD_TYPE=Release -DARM_COMPUTE_OPENMP=1 -DARM_COMPUTE_WERROR=0 -DARM_COMPUTE_BUILD_EXAMPLES=1 -DARM_COMPUTE_BUILD_TESTING=1 -DCMAKE_INSTALL_LIBDIR=.
+	CC=clang CXX=clang++ cmake --build . -j32
+
+(experimental) Bazel:
+
+	CC=clang CXX=clang++ bazel build //...
+
+Scons:
+
+	CC=clang CXX=clang++ scons -j32 Werror=1 debug=0 neon=1 openmp=1 cppthreads=1 os=linux arch=armv8a multi_isa=1 build=native validation_tests=1
+
+Configurations supported are limited to the configurations supported by our CMake, Bazel and Multi ISA Scons builds. For more details on CMake and Bazel builds, please see @ref S1_8_experimental_builds
 
 @section S1_3_android Building for Android
 
 For Android, the library was successfully built and tested using Google's standalone toolchains:
- - clang++ from NDK r18b for armv7a
- - clang++ from NDK r20b for arm64-v8a
- - clang++ from NDK r20b for arm64-v8.2-a with FP16 support
+ - clang++ from NDK r20b for armv8a
+ - clang++ from NDK r20b for armv8.2-a with FP16 support
 
-For NDK r18 or older, here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a>:
+(From 23.02, NDK >= r20b is highly recommended) For NDK r18 or older, here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a>:
 - Download the NDK r18b from here: https://developer.android.com/ndk/downloads/index.html to directory $NDK
 - Make sure you have Python 2.7 installed on your machine.
 - Generate the 32 and/or 64 toolchains by running the following commands to your toolchain directory $MY_TOOLCHAINS:
 
 	$NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r18b --stl libc++ --api 21
+
 	$NDK/build/tools/make_standalone_toolchain.py --arch arm --install-dir $MY_TOOLCHAINS/arm-linux-android-ndk-r18b --stl libc++ --api 21
 
 For NDK r19 or newer, you can directly <a href="https://developer.android.com/ndk/downloads">Download</a> the NDK package for your development platform, without the need to launch the make_standalone_toolchain.py script. You can find all the prebuilt binaries inside $NDK/toolchains/llvm/prebuilt/$OS_ARCH/bin/.
-@attention the building script will look for a binary named "aarch64-linux-android-clang++", while the prebuilt binaries will have their API version as a suffix to their filename (e.g. "aarch64-linux-android21-clang++"). You should copy/rename the binary removing this suffix, or - alternatively - create an alias for it.
 
+@parblock
+@attention The building script will look for a binary named "aarch64-linux-android-clang++", while the prebuilt binaries will have their API version as a suffix to their filename (e.g. "aarch64-linux-android21-clang++"). You can instruct scons to use the correct version by using a combination of the "toolchain_prefix" parameter and the "CC" and "CXX" environment variables.
+@attention For this particular example, you can specify:
+
+	CC=clang CXX=clang++ scons toolchain_prefix=aarch64-linux-android21-
+
+@attention or:
+
+	CC=aarch64-linux-android21-clang CXX=aarch64-linux-android21-clang++ scons toolchain_prefix=""
+
+@endparblock
+
+@parblock
 @attention We used to use gnustl but as of NDK r17 it is deprecated so we switched to libc++
+@endparblock
 
 @note Make sure to add the toolchains to your PATH:
 
@@ -396,7 +252,7 @@ To cross-compile the library in debug mode, with Arm® Neon™ only support, for
 
 To cross-compile the library in asserts mode, with OpenCL only support, for Android 64bit:
 
-	CXX=clang++ CC=clang scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=android arch=arm64-v8a
+	CXX=clang++ CC=clang scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=android arch=armv8a
 
 @subsection S1_3_2_examples How to manually build the examples ?
 
@@ -409,23 +265,23 @@ Once you've got your Android standalone toolchain built and added to your path y
 To cross compile a Arm® Neon™ example:
 
 	#32 bit:
-	arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_arm -static-libstdc++ -pie
+	arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_arm -static-libstdc++ -pie
 	#64 bit:
-	aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie
+	aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie
 
 To cross compile an OpenCL example:
 
 	#32 bit:
-	arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
+	arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
 	#64 bit:
-	aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
+	aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
 
 To cross compile the examples with the Graph API, such as graph_lenet.cpp, you need to link the library arm_compute_graph also.
 
 	#32 bit:
-	arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
+	arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
 	#64 bit:
-	aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
+	aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
 
 @note Due to some issues in older versions of the Arm® Mali™ OpenCL DDK (<= r13p0), we recommend to link arm_compute statically on Android.
 @note When linked statically the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly
@@ -469,7 +325,7 @@ The library was successfully natively built for Apple Silicon under macOS 11.1 u
 
 To natively compile the library with accelerated CPU support:
 
-	scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=arm64-v8a build=native
+	scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=armv8a build=native
 
 @note Initial support disables feature discovery through HWCAPS and thread scheduling affinity controls
 
@@ -477,40 +333,40 @@ To natively compile the library with accelerated CPU support:
 
 For bare metal, the library was successfully built using linaro's latest (gcc-linaro-6.3.1-2017.05) bare metal toolchains:
  - arm-eabi for armv7a
- - aarch64-elf for arm64-v8a
+ - aarch64-elf for armv8a
 
-Download linaro for <a href="https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/arm-eabi/">armv7a</a> and <a href="https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/aarch64-elf/">arm64-v8a</a>.
+Download linaro for <a href="https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/arm-eabi/">armv7a</a> and <a href="https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/aarch64-elf/">armv8a</a>.
 
 @note Make sure to add the toolchains to your PATH: export PATH=$PATH:$MY_TOOLCHAINS/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-elf/bin:$MY_TOOLCHAINS/gcc-linaro-6.3.1-2017.05-x86_64_arm-eabi/bin
 
 @subsection S1_5_1_library How to build the library ?
 
-To cross-compile the library with Arm® Neon™ support for baremetal arm64-v8a:
+To cross-compile the library with Arm® Neon™ support for baremetal armv8a:
 
-	scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=bare_metal arch=arm64-v8a build=cross_compile cppthreads=0 openmp=0 standalone=1
+	scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=bare_metal arch=armv8a build=cross_compile cppthreads=0 openmp=0 standalone=1
 
 @subsection S1_5_2_examples How to manually build the examples ?
 
 Examples are disabled when building for bare metal. If you want to build the examples you need to provide a custom bootcode depending on the target architecture and link against the compute library. More information about bare metal bootcode can be found <a href="http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0527a/index.html">here</a>.
 
-@section S1_6_windows_host Building on a Windows host system
+@section S1_6_windows_host Building on a Windows® host system (cross-compile)
 
-Using `scons` directly from the Windows command line is known to cause
+Using `scons` directly from the Windows® command line is known to cause
 problems. The reason seems to be that if `scons` is setup for cross-compilation
-it gets confused about Windows style paths (using backslashes). Thus it is
+it gets confused about Windows® style paths (using backslashes). Thus it is
 recommended to follow one of the options outlined below.
 
-@subsection S1_6_1_ubuntu_on_windows Bash on Ubuntu on Windows
+@subsection S1_6_1_ubuntu_on_windows Bash on Ubuntu on Windows® (cross-compile)
 
 The best and easiest option is to use
-<a href="https://msdn.microsoft.com/en-gb/commandline/wsl/about">Ubuntu on Windows</a>.
+<a href="https://msdn.microsoft.com/en-gb/commandline/wsl/about">Ubuntu on Windows®</a>.
 This feature is still marked as *beta* and thus might not be available.
 However, if it is building the library is as simple as opening a *Bash on
-Ubuntu on Windows* shell and following the general guidelines given above.
+Ubuntu on Windows®* shell and following the general guidelines given above.
 
-@subsection S1_6_2_cygwin Cygwin
+@subsection S1_6_2_cygwin Cygwin (cross-compile)
 
-If the Windows subsystem for Linux is not available <a href="https://www.cygwin.com/">Cygwin</a>
+If the Windows® subsystem for Linux is not available <a href="https://www.cygwin.com/">Cygwin</a>
 can be used to install and run `scons`, the minimum Cygwin version must be 3.0.7 or later. In addition
 to the default packages installed by Cygwin `scons` has to be selected in the installer. (`git` might
 also be useful but is not strictly required if you already have got the source
@@ -521,6 +377,38 @@ compiler is included in the Android standalone toolchain. After everything has
 been set up in the Cygwin terminal the general guide on building the library
 can be followed.
 
+@subsection S1_6_3_WoA Windows® on Arm™ (native build)
+
+    Native builds on Windows® are experimental and some features from the library interacting with the OS are missing.
+
+It's possible to build Compute Library natively on a Windows® system running on Arm™.
+
+Windows® on Arm™ (WoA) systems provide compatibility by emulating x86 binaries on aarch64. Unfortunately, Visual Studio 2022 does not work on aarch64 systems because it is an x86_64 application and these binaries cannot be executed on WoA yet.
+
+Because we cannot use Visual Studio to build Compute Library we have to set up a native standalone toolchain to compile C++ code for arm64 on Windows®.
+
+Native arm64 toolchain installation for WoA:
+- LLVM+Clang-12 which can be downloaded from: https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe
+- Arm64 VC Runtime which can be downloaded from  https://aka.ms/vs/17/release/vc_redist.arm64.exe
+
+- While full VS22 cannot be installed on WoA, we can install some components
+    -# Desktop development with C++ and all Arm64 components for Visual Studio, refer to:  https://developer.arm.com/documentation/102528/0100/Install-Visual-Studio
+    -# VS22 build tools: https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022
+
+There are some additional tools we need to install to build Compute Library:
+
+- git https://git-scm.com/download/win
+- python 3 https://www.python.org/downloads/windows/
+- scons can be installed with pip install scons
+
+In order to use clang to build Windows® binaries natively, we have to initialize the environment variables from VS22 correctly so that the compiler can find the arm64 C++ libraries. This can be done by pressing the Windows key + R and running the command:
+
+    cmd /k "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsx86_arm64.bat"
+
+To build Compute Library type:
+
+     scons opencl=0 neon=1 os=windows examples=0 validation_tests=1 benchmark_examples=0 build=native arch=armv8a Werror=0 exceptions=1 standalone=1
+
 @section S1_7_cl_requirements OpenCL DDK Requirements
 
 @subsection S1_7_1_cl_hard_requirements Hard Requirements
@@ -537,5 +425,145 @@ OpenCL kernel level debugging can be simplified with the use of printf, this req
 
 SVM allocations are supported for all the underlying allocations in Compute Library. To enable this OpenCL 2.0 and above is a requirement.
 
+@section S1_8_experimental_builds Experimental Bazel and CMake builds
+
+In addition to the scons build the repository includes experimental Bazel and CMake builds.
+These builds currently support a limited range of options. Both are similar to the scons multi_isa build: they compile all libraries with Arm® Neon™ support, as well as the SVE and SVE2 libraries. The builds are CPU only and do not include OpenCL support. Only Linux environments are targeted for now. Both were successfully built with gcc / g++ version 10.2.
+
+@subsection S1_8_1_bazel_build Bazel build
+
+@subsubsection S1_8_1_1_file_structure File structure
+
+File structure for all files included in the Bazel build:
+
+	.
+	├──  .bazelrc
+	├──  BUILD
+	├──  WORKSPACE
+	├── arm_compute
+	│   └── BUILD
+	├── examples
+	│   └── BUILD
+	├── include
+	│   └── BUILD
+	├── scripts
+	│   ├── print_version_file.py
+	│   └── BUILD
+	├── src
+	│   └── BUILD
+	├── support
+	│   └── BUILD
+	├── tests
+	│   ├── BUILD
+	│   └── framework
+	│       └── BUILD
+	└── utils
+		└── BUILD
+
+@subsubsection S1_8_1_2_build_options Build options
+
+Available build options:
+
+	- debug: Enable ['-O0','-g','-gdwarf-2'] compilation flags
+	- Werror: Enable -Werror compilation flag
+	- logging: Enable logging
+	- cppthreads: Enable C++11 threads backend
+	- openmp: Enable OpenMP backend
+
+@subsubsection S1_8_1_3_example_builds Example builds
+
+Build everything (libraries, examples, tests):
+
+	bazel build //...
+
+Build libraries:
+
+	bazel build //:all
+
+Build arm_compute only:
+
+	bazel build //:arm_compute
+
+Build examples:
+
+	bazel build //examples:all
+
+Build resnet50 example:
+
+	bazel build //examples:graph_resnet50
+
+Build validation and benchmarking:
+
+	bazel build //tests:all
+
+@subsection S1_8_2_cmake_build CMake build
+
+@subsubsection S1_8_2_1_file_structure File structure
+
+File structure for all files included in the CMake build:
+
+	.
+	├──  CMakeLists.txt
+	├── cmake
+	│   ├── Options.cmake
+	│   ├── Version.cmake
+	│   └── toolchains
+	│       └── aarch64_linux_toolchain.cmake
+	├── examples
+	│   └── CMakeLists.txt
+	├── src
+	│   └── CMakeLists.txt
+	└── tests
+		├── CMakeLists.txt
+		├── benchmark
+		│   └── CMakeLists.txt
+		└── validation
+			└── CMakeLists.txt
+
+@subsubsection S1_8_2_2_build_options Build options
+
+Available build options:
+
+	- CMAKE_BUILD_TYPE: "Release" (default) enables ['-O3', '-DNDEBUG'] compilation flags, "Debug" enables ['-O0','-g','-gdwarf-2', '-DARM_COMPUTE_ASSERTS_ENABLED']
+	- ARM_COMPUTE_WERROR: Enable -Werror compilation flag
+	- ARM_COMPUTE_EXCEPTIONS: If disabled ARM_COMPUTE_EXCEPTIONS_DISABLED is enabled
+	- ARM_COMPUTE_LOGGING: Enable logging
+	- ARM_COMPUTE_BUILD_EXAMPLES: Build examples
+	- ARM_COMPUTE_BUILD_TESTING: Build tests
+	- ARM_COMPUTE_CPPTHREADS: Enable C++11 threads backend
+	- ARM_COMPUTE_OPENMP: Enable OpenMP backend
+
+@subsubsection S1_8_2_3_example_builds Example builds
+
+To build libraries, examples and tests:
+
+	mkdir build
+	cd build
+	cmake .. -DCMAKE_BUILD_TYPE=Release -DARM_COMPUTE_OPENMP=1 -DARM_COMPUTE_WERROR=0 -DARM_COMPUTE_BUILD_EXAMPLES=1 -DARM_COMPUTE_BUILD_TESTING=1 -DCMAKE_INSTALL_LIBDIR=.
+	cmake --build . -j32
+
+@section S1_9_fixed_format Building with support for fixed format kernels
+
+@subsection S1_9_1_intro_to_fixed_format_kernels What are fixed format kernels?
+
+The GEMM kernels used for convolutions and fully-connected layers in Compute Library employ memory layouts optimized for each kernel implementation. This then requires the supplied weights to be re-ordered into a buffer ready for consumption by the GEMM kernel. Where Compute Library is being called from a framework or library which implements operator caching, the re-ordering of the inputted weights into an intermediate buffer may no longer be desirable. When using a cached operator, the caller may wish to re-write the weights tensor, and re-run the operator using the updated weights. With the default GEMM kernels in Compute Library, the GEMM will be executed with the old weights, leading to incorrect results.
+
+To address this, Compute Library provides a set of GEMM kernels which use a common blocked memory format. These kernels consume the input weights directly from the weights buffer and do not execute an intermediate pre-transpose step. With this approach, it is the responsibility of the user (in this case the calling framework) to ensure that the weights are re-ordered into the required memory format. @ref NEGEMM::has_opt_impl is a static function that queries whether a fixed-format kernel exists and, if so, reports the expected weights format. The supported weight formats are enumerated in @ref arm_compute::WeightFormat.
+
+@subsection S1_9_2_building_fixed_format Building with fixed format kernels
+
+Fixed format kernels are only available for the CPU backend. To build Compute Library with fixed format kernels set fixed_format_kernels=1:
+
+        scons Werror=1 debug=0 neon=1 opencl=0 embed_kernels=0 os=linux multi_isa=1 build=native cppthreads=1 openmp=0 fixed_format_kernels=1
+
+@section S1_10_doxygen Building the Doxygen Documentation
+
+This documentation has been generated using the following shell command:
+
+        $ ./scripts/generate_documentation.sh
+
+This requires Doxygen to be installed and available on your system.
+
 */
+
 } // namespace arm_compute
diff --git a/docs/user_guide/introduction.dox b/docs/user_guide/introduction.dox
index 6b10b9c2a2..15c95f7103 100644
--- a/docs/user_guide/introduction.dox
+++ b/docs/user_guide/introduction.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -34,11 +34,15 @@ namespace arm_compute
 The Compute Library is a collection of low-level machine learning functions optimized for both Arm CPUs and GPUs using SIMD technologies.
 
 Several builds of the library are available using various configurations:
- - OS: Linux, Android, macOS or bare metal.
- - Architecture: armv7a (32bit) or arm64-v8a (64bit).
+ - OS: Linux®, Android™, macOS or bare metal.
+ - Architecture: armv7a (32bit) or armv8a (64bit).
  - Technology: Arm® Neon™ / OpenCL / Arm® Neon™ and OpenCL.
  - Debug / Asserts / Release: Use a build with asserts enabled to debug your application and enable extra validation. Once you are sure your application works as expected you can switch to a release build of the library for maximum performance.
 
+@warning Deprecation Notice from 24.01: NCHW data format specific optimizations will gradually be removed from the code base in
+    future releases. The implication of this is that the user is expected to translate NCHW models into NHWC in
+    order to benefit from the optimizations.
+
 @b Minimum toolchains requirements are shown below:
 
 <table>
@@ -47,11 +51,11 @@ Several builds of the library are available using various configurations:
   <th>Architecture
   <th>Minimum Toolchain
 <tr>
-  <td rowspan="4">Linux
+  <td rowspan="4">Linux®
   <td>armv7a
   <td>gcc-linaro-6.3.1-2017.05-x86_64_arm-linux-gnueabihf
   <tr>
-  <td>arm64-v8a
+  <td>armv8a
   <td rowspan="2">gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu
   <tr>
   <td>armv8.2-a
@@ -59,13 +63,18 @@ Several builds of the library are available using various configurations:
   <td>armv8.2-a-sve
   <td>gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
 <tr>
-  <td rowspan="3">Android
-  <td>armv7a
-  <td rowspan="3">NDK r18b
+  <td rowspan="3">Android™
+  <td>armv8a
+  <td rowspan="2">NDK r20b
   <tr>
-  <td>arm64-v8a
+  <td>armv8.2-a
   <tr>
+  <td>armv8.2-a-sve
+  <td>NDK r23b
+<tr>
+  <td rowspan="1">macOS
   <td>armv8.2-a
+  <td>Monterey (OS version): clang 13 (native)
 </table>
 
 @section S0_1_contact Contact / Support
@@ -74,19 +83,22 @@ Please create an issue on <a href="https://github.com/ARM-software/ComputeLibrar
 
 In order to facilitate the work of the support team please provide the build information of the library you are using. To get the version of the library you are using simply run:
 
-    $ strings android-armv7a-cl-asserts/libarm_compute.so | grep arm_compute_version
-    arm_compute_version=v16.12 Build options: {'embed_kernels': '1', 'opencl': '1', 'arch': 'armv7a', 'neon': '0', 'asserts': '1', 'debug': '0', 'os': 'android', 'Werror': '1'} Git hash=f51a545d4ea12a9059fe4e598a092f1fd06dc858
+    $ strings android-armv8a-cl-asserts/libarm_compute.so | grep arm_compute_version
+    arm_compute_version=v16.12 Build options: {'embed_kernels': '1', 'opencl': '1', 'arch': 'armv8a', 'neon': '0', 'asserts': '1', 'debug': '0', 'os': 'android', 'Werror': '1'} Git hash=f51a545d4ea12a9059fe4e598a092f1fd06dc858
 
 @section S0_2_prebuilt_binaries Pre-built binaries
 
 For each release we provide some pre-built binaries of the library [here](https://github.com/ARM-software/ComputeLibrary/releases).
 
 These binaries have been built using the following toolchains:
-            - Linux armv7a: gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf
-            - Linux arm64-v8a: gcc-linaro-7.2.1-2017.11-x86_64_aarch64-linux-gnu
-            - Linux armv8.2a-sve: gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
-            - Android armv7a: clang++ / libc++ NDK r18b
-            - Android am64-v8a: clang++ / libc++ NDK r20b
+            - Linux® armv7a: gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf
+            - Linux® armv8a: gcc-linaro-7.2.1-2017.11-x86_64_aarch64-linux-gnu
+            - Linux® armv8.2-a: gcc-linaro-7.2.1-2017.11-x86_64_aarch64-linux-gnu
+            - Linux® armv8.2-a (multi-ISA binary): gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
+            - Linux® armv8.2-a-sve: gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu
+            - Android™ armv8a: clang++ / libc++ NDK r20b
+            - Android™ armv8.2-a: clang++ / libc++ NDK r20b
+            - Android™ armv8.2-a-sve: clang++ / libc++ NDK r23b
 
 @warning Make sure to use a compatible toolchain to build your application or you will get some std::bad_alloc errors at runtime.
 
diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox
index e987eac752..5a337c374b 100644
--- a/docs/user_guide/library.dox
+++ b/docs/user_guide/library.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2021, 2023-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -28,20 +28,18 @@ namespace arm_compute
 
 @tableofcontents
 
-@section architecture_core_vs_runtime Core vs Runtime libraries
+@section architecture_compute_library Compute Library architecture
 
-The Core library is a low level collection of algorithms implementations, it is designed to be embedded in existing projects and applications:
+The Compute Library is a collection of low level algorithm implementations known as kernels @ref IKernel.
+These kernels are implemented as operators @ref IOperator that do not allocate any memory (i.e. all the memory allocations/mappings have to be handled by the caller)
+and are designed to be embedded in existing projects and applications.
 
-- It doesn't allocate any memory (All the memory allocations/mappings have to be handled by the caller).
-- It doesn't perform any kind of multi-threading (but provide information to the caller about how the workload can be split).
+A higher-level interface wraps the operators into functions @ref IFunction that:
+- Perform memory allocation of images and tensors through the use of standard malloc().
+- Enable multi-threading of Arm® Neon™ code in a very basic way using a very simple pool of threads.
+- For OpenCL, use the default CLScheduler command queue for all mapping operations and kernels.
 
-The Runtime library is a very basic wrapper around the Core library which can be used for quick prototyping, it is basic in the sense that:
-
-- It allocates images and tensors by using standard malloc().
-- It multi-threads Arm® Neon™ code in a very basic way using a very simple pool of threads.
-- For OpenCL it uses the default CLScheduler command queue for all mapping operations and kernels.
-
-For maximum performance, it is expected that the users would re-implement an equivalent to the runtime library which suits better their needs (With a more clever multi-threading strategy, load-balancing between Arm® Neon™ and OpenCL, etc.)
+For maximum performance, it is expected that the users would re-implement an equivalent to the function interface which suits better their needs (With a more clever multi-threading strategy, load-balancing between Arm® Neon™ and OpenCL, etc.)
 
 @section architecture_fast_math Fast-math support
 
@@ -54,6 +52,17 @@ When the fast-math flag is enabled, both Arm® Neon™ and CL convolution layers
     - no-fast-math: No Winograd support
     - fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7
 
+@section bf16_acceleration BF16 acceleration
+
+Required toolchain: android-ndk-r23-beta5 or later.
+
+To build for BF16: "neon" flag should be set "=1" and "arch" has to be "=armv8.6-a", "=armv8.6-a-sve", or "=armv8.6-a-sve2". For example:
+
+	scons arch=armv8.6-a-sve neon=1 opencl=0 extra_cxx_flags="-fPIC" benchmark_tests=0 validation_tests=0 validation_examples=1 os=android Werror=0 toolchain_prefix=aarch64-linux-android29
+
+To enable BF16 acceleration when running FP32, "fast-math" has to be enabled; this works only for the Neon convolution layer using CPU GEMM.
+In this scenario on CPU, the CpuGemmConv2d kernel converts from FP32 (the input tensor type) to BF16 at block level to exploit the arithmetic capabilities dedicated to BF16, and then converts the result back to FP32 (the output tensor type).
+
 @section architecture_thread_safety Thread-safety
 
 Although the library supports multi-threading during workload dispatch, thus parallelizing the execution of the workload at multiple threads, the current runtime module implementation is not thread-safe in the sense of executing different functions from separate threads.
@@ -555,5 +564,52 @@ The responsibilities of the operators can be summarized as follows:
 - Providing information to the caller required by the computation (e.g., memory requirements)
 - Allocation of any required auxiliary memory if it isn't given by its caller explicitly
 
+@subsection architecture_experimental_build_multi_isa Build multi-ISA binary
+
+Selecting multi_isa when building Compute Library will create a library that contains all the supported ISA features.
+Based on the CPU support, the appropriate kernel will be selected at runtime for execution. Currently this option is
+supported in two configurations: (i) with armv8.2-a (ii) with armv8-a. In both cases all the supported ISA features are enabled
+in the build.
+
+The arch option in a multi_isa build sets the minimum architecture required to run the resulting binary.
+For example, a multi_isa build for armv8-a will run on any armv8-a or later device. When the binary is executed on an armv8.2-a device
+it will use the additional cpu features present in this architecture: FP16 and dot product.
+In order to have a binary like this (multi_isa+armv8-a) the FP16 and dot product kernels in the library are compiled for the
+target armv8.2-a and all other common code for armv8-a.
+
+@subsection architecture_experimental_per_operator_build Per-operator build
+
+Dependencies for all operators have been explicitly defined; this allows users to generate Compute Library
+binaries that include a user-defined list of operators.
+
+An experimental flag 'build_config' has been introduced where a JSON configuration file can be provided and consumed.
+An example config looks like:
+@code{.py}
+{
+    "operators": [
+        "Activation",
+        "DepthwiseConv2d",
+        "Conv2d",
+        "Permute",
+        "Pool2d",
+        "Reshape"
+    ],
+    "data_types": [
+        "NHWC"
+    ]
+}
+@endcode
+
+Supported data-types options are:
+- "NHWC"
+- "NCHW"
+
+The list of supported operators can be found in filelist.json in the root of Compute Library repo.
+
+@subsection architecture_experimental_build_high_priority_operators Build high priority operators
+
+When high_priority is selected while building Compute Library, one new library will be created: libarm_compute_hp. It
+will contain a selected subset of the library operators. Currently the operators are statically set.
+
 */
 } // namespace arm_compute
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index 05cc892d40..e7f1823f8b 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2021 Arm Limited.
+/// Copyright (c) 2021-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -52,9 +52,10 @@ Compute Library supports the following data layouts (fast changing dimension fro
   <ul>
     <li>NHWC: The native layout of Compute Library that delivers the best performance where channels are in the fastest changing dimension
     <li>NCHW: Legacy layout where width is in the fastest changing dimension
+    <li>NDHWC: New data layout for supporting 3D operators
     <li>All: Agnostic to any specific data layout
   </ul>
-where N = batches, C = channels, H = height, W = width
+where N = batches, C = channels, H = height, W = width, D = depth
 
 <table>
 <caption id="multi_row"></caption>
@@ -108,6 +109,26 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>F32<td>F32
     </table>
 <tr>
+  <td rowspan="1">AddMulAdd
+  <td rowspan="1" style="width:200px;"> Performs a fused Add + Mul + Add [+ Relu-based-Activation] operation.
+  <td rowspan="1">
+      <ul>
+       <li>n/a
+      </ul>
+  <td>NEAddMulAdd
+  <td>
+      <ul>
+       <li>Any
+      </ul>
+  <td>
+    <table>
+    <tr><th>input1<th>input2<th>bn_mul<th>bn_add<th>add_output<th>final_output
+    <tr><td>QASYMM8<td>QASYMM8<td>QASYMM8<td>QASYMM8<td>QASYMM8<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    <tr><td>F16<td>F16<td>F16<td>F16<td>F16<td>F16
+    <tr><td>F32<td>F32<td>F32<td>F32<td>F32<td>F32
+    </table>
+<tr>
   <td rowspan="2">ArgMinMaxLayer
   <td rowspan="2" style="width:200px;"> Function to calculate the index of the minimum or maximum values in a tensor based on an axis.
   <td rowspan="2">
@@ -125,7 +146,7 @@ where N = batches, C = channels, H = height, W = width
     <tr><th>src<th>dst
     <tr><td>QASYMM8<td>U32, S32
     <tr><td>QASYMM8_SIGNED<td>U32, S32
-    <tr><td>S32<td>U32, S32
+    <tr><td>S32<td>U32, S32, S64
     <tr><td>F16<td>U32, S32
     <tr><td>F32<td>U32, S32
     </table>
@@ -164,9 +185,6 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>QSYMM16<td>QSYMM16<td>QASYMM16
     <tr><td>QSYMM16<td>QSYMM16<td>S32
     <tr><td>U8<td>U8<td>U8
-    <tr><td>U8<td>U8<td>S16
-    <tr><td>U8<td>S16<td>S16
-    <tr><td>S16<td>U8<td>S16
     <tr><td>S16<td>S16<td>S16
     <tr><td>S32<td>S32<td>S32
     <tr><td>F16<td>F16<td>F16
@@ -192,9 +210,6 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>QSYMM16<td>QSYMM16<td>QASYMM16
     <tr><td>QSYMM16<td>QSYMM16<td>S32
     <tr><td>U8<td>U8<td>U8
-    <tr><td>U8<td>U8<td>S16
-    <tr><td>U8<td>S16<td>S16
-    <tr><td>S16<td>U8<td>S16
     <tr><td>S16<td>S16<td>S16
     <tr><td>S32<td>S32<td>S32
     <tr><td>F16<td>F16<td>F16
@@ -442,12 +457,15 @@ where N = batches, C = channels, H = height, W = width
     <table>
     <tr><th>src<th>dst
     <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32
+    <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32
     <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32
     <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32
     <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32
     <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32
-    <tr><td>F16<td>U8, S8, U16, S16, U32, F32
-    <tr><td>F32<td>U8, S8, U16, S16, U32, F16
+    <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32
+    <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32
+    <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32
+    <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16
     </table>
 <tr>
   <td rowspan="2">ChannelShuffleLayer
@@ -460,6 +478,7 @@ where N = batches, C = channels, H = height, W = width
   <td>
       <ul>
        <li>NCHW
+       <li>NHWC
       </ul>
   <td>
     <table>
@@ -471,6 +490,7 @@ where N = batches, C = channels, H = height, W = width
   <td>
       <ul>
        <li>NCHW
+       <li>NHWC
       </ul>
   <td>
     <table>
@@ -604,6 +624,40 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>QASYMM8_SIGNED<td>QSYMM8_PER_CHANNEL<td>S32<td>QASYMM8_SIGNED
     </table>
 <tr>
+  <td rowspan="2">Conv3D
+  <td rowspan="2" style="width:200px;"> Function to compute a 3d convolution layer.
+  <td rowspan="2">
+      <ul>
+       <li>ANEURALNETWORKS_CONV_3D
+      </ul>
+  <td>NEConv3D
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src0<th>src1<th>src2<th>dst
+    <tr><td>F16<td>F16<td>F16<td>F16
+    <tr><td>F32<td>F32<td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8<td>S32<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>S32<td>QASYMM8_SIGNED
+    </table>
+<tr>
+  <td>CLConv3D
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src0<th>src1<th>src2<th>dst
+    <tr><td>F16<td>F16<td>F16<td>F16
+    <tr><td>F32<td>F32<td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8<td>S32<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>S32<td>QASYMM8_SIGNED
+    </table>
+<tr>
   <td rowspan="2">Copy
   <td rowspan="2" style="width:200px;"> Function to copy a tensor.
   <td rowspan="2">
@@ -1408,9 +1462,9 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>All<td>All
     </table>
 <tr>
-  <td rowspan="2">FillBorder
-  <td rowspan="2" style="width:200px;"> Function to fill the borders within the XY-planes.
-  <td rowspan="2">
+  <td rowspan="1">FillBorder
+  <td rowspan="1" style="width:200px;"> Function to fill the borders within the XY-planes.
+  <td rowspan="1">
       <ul>
        <li>n/a
       </ul>
@@ -1425,17 +1479,6 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>All<td>All
     </table>
 <tr>
-  <td>CLFillBorder
-  <td>
-      <ul>
-       <li>All
-      </ul>
-  <td>
-    <table>
-    <tr><th>src<th>dst
-    <tr><td>All<td>All
-    </table>
-<tr>
   <td rowspan="2">FlattenLayer
   <td rowspan="2" style="width:200px;"> Reshape a tensor to be 1D
   <td rowspan="2">
@@ -1730,6 +1773,7 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>S32<td>S32
     <tr><td>QASYMM8_SIGNED<td>QSYMM8_PER_CHANNEL<td>S32<td>S32
     <tr><td>QASYMM8_SIGNED<td>QSYMM8<td>S32<td>S32
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>F32<td>F32
     </table>
 <tr>
   <td>CLGEMMLowpMatrixMultiplyCore
@@ -2032,6 +2076,41 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>QASYMM8<td>S32<td>QSYMM16<td>QASYMM8<td>QSYMM16<td>QASYMM8
     </table>
 <tr>
+  <td rowspan="2">MatMul
+  <td rowspan="2" style="width:200px;"> Computes a matrix multiplication in batches.
+  <td rowspan="2">
+      <ul>
+       <li>ANEURALNETWORKS_BATCH_MATMUL
+      </ul>
+  <td>NEMatMul
+  <td>
+      <ul>
+       <li>Any
+      </ul>
+  <td>
+    <table>
+    <tr><th>lhs<th>rhs<th>dst
+    <tr><td>F32<td>F32<td>F32
+    <tr><td>F16<td>F16<td>F16
+    <tr><td>BFLOAT16<td>BFLOAT16<td>BFLOAT16
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    <tr><td>QASYMM8<td>QASYMM8<td>QASYMM8
+    </table>
+<tr>
+  <td>CLMatMul
+  <td>
+      <ul>
+       <li>All
+      </ul>
+  <td>
+    <table>
+    <tr><th>lhs<th>rhs<th>dst
+    <tr><td>F32<td>F32<td>F32
+    <tr><td>F16<td>F16<td>F16
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    <tr><td>QASYMM8<td>QASYMM8<td>QASYMM8
+    </table>
+<tr>
   <td rowspan="2">MaxUnpoolingLayer
   <td rowspan="2" style="width:200px;"> Function to perform MaxUnpooling.
   <td rowspan="2">
@@ -2132,6 +2211,27 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>F16<td>F16
     </table>
 <tr>
+  <td rowspan="1">NormalizePlanarYUVLayer
+  <td rowspan="1" style="width:200px;"> Function to compute normalization planar YUV layer.
+  <td rowspan="1">
+      <ul>
+       <li>n/a
+      </ul>
+  <td>CLNormalizePlanarYUVLayer
+  <td>
+      <ul>
+       <li>NHWC
+       <li>NCHW
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F32<td>F32
+    <tr><td>F16<td>F16
+    <tr><td>QASYMM8<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    </table>
+<tr>
   <td rowspan="2">PadLayer
   <td rowspan="2" style="width:200px;"> Function to pad a tensor.
   <td rowspan="2">
@@ -2280,6 +2380,40 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>F32<td>F32
     </table>
 <tr>
+  <td rowspan="2">Pooling3dLayer
+  <td rowspan="2" style="width:200px;"> Function to perform pooling 3D with the specified pooling operation.
+  <td rowspan="2">
+      <ul>
+       <li>N/A
+      </ul>
+  <td>NEPooling3dLayer
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F16<td>F16
+    <tr><td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    </table>
+<tr>
+  <td>CLPooling3dLayer
+  <td>
+      <ul>
+       <li>NDHWC
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F16<td>F16
+    <tr><td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
+    </table>
+<tr>
   <td rowspan="2">PReluLayer
   <td rowspan="2" style="width:200px;"> Function to compute the activation layer with the PRELU activation function.
   <td rowspan="2">
@@ -2525,6 +2659,23 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>S32<td>S32
     </table>
 <tr>
+  <td rowspan="1">ReorderLayer
+  <td rowspan="1" style="width:200px;"> Reorders a tensor to a different weights format.
+  <td rowspan="1">
+      <ul>
+       <li>n/a
+      </ul>
+  <td>NEReorderLayer
+  <td>
+      <ul>
+       <li>NCHW
+      </ul>
+  <td>
+    <table>
+    <tr><th>src<th>dst
+    <tr><td>F32<td>F32
+    </table>
+<tr>
   <td rowspan="2">ReorgLayer
   <td rowspan="2" style="width:200px;"> Performs a reorganization layer of input tensor to the output tensor.
   <td rowspan="2">
@@ -2598,7 +2749,7 @@ where N = batches, C = channels, H = height, W = width
   <td>
     <table>
     <tr><th>src0<th>src1<th>dst
-    <tr><td>All<td>U32<td>All
+    <tr><td>All<td>U32, S32<td>All
     </table>
 <tr>
   <td>CLReverse
@@ -2609,7 +2760,7 @@ where N = batches, C = channels, H = height, W = width
   <td>
     <table>
     <tr><th>src0<th>src1<th>dst
-    <tr><td>All<td>U32<td>All
+    <tr><td>All<td>U32, S32<td>All
     </table>
 <tr>
   <td rowspan="2">RNNLayer
@@ -2730,6 +2881,7 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>F16<td>F16
     <tr><td>F32<td>F32
     <tr><td>U8<td>U8
+    <tr><td>S8<td>S8
     <tr><td>S16<td>S16
     </table>
 <tr>
@@ -3100,26 +3252,7 @@ where N = batches, C = channels, H = height, W = width
     <tr><td>F16<td>F16<td>F16<td>F16
     <tr><td>F32<td>F32<td>F32<td>F32
     </table>
-<tr>
-  <td rowspan="1">WinogradInputTransform
-  <td rowspan="1" style="width:200px;"> Function to perform a Winograd transform on the input tensor.
-  <td rowspan="1">
-      <ul>
-       <li>n/a
-      </ul>
-  <td>CLWinogradInputTransform
-  <td>
-      <ul>
-       <li>NHWC
-       <li>NCHW
-      </ul>
-  <td>
-    <table>
-    <tr><th>src<th>dst
-    <tr><td>F16<td>F16
-    <tr><td>F32<td>F32
-    </table>
 </table>
 
 */
-} // namespace
-\ No newline at end of file
+} // namespace
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 3ffa11b045..a5f61d669d 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2021 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -37,9 +37,313 @@ If there is more than one release in a month then an extra sequential number is
 	v17.04 (First release of April 2017)
 
 @note We're aiming at releasing one major public release with new features per quarter. All releases in between will only contain bug fixes.
+@note Starting from release 22.05, 'master' branch is no longer being used, it has been replaced by 'main'. Please update your clone jobs accordingly.
 
 @section S2_2_changelog Changelog
 
+v24.05 Public major release
+ - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types
+ - Various fixes to enable FP16 kernels in armv8a multi_isa builds.
+ - Updated logic in the OpenMP scheduler to exclude LITTLE cores.
+
+v24.04 Public major release
+ - Add Bfloat16 data type support for @ref NEMatMul.
+ - Add support for SoftMax in SME2 for FP32, FP16, QASYMM8 and QASYMM8_SIGNED.
+ - Add support for in place accumulation to CPU GEMM kernels.
+ - Add low-precision Int8 * Int8 -> FP32 CPU GEMM which dequantizes after multiplication
+ - Add is_dynamic flag to QuantizationInfo to signal to operators that it may change after configuration
+ - Performance optimizations:
+   - Optimize start-up time of @ref NEConvolutionLayer for some input configurations where GeMM is selected as the convolution algorithm
+   - Optimize @ref NEConvolutionLayer for input tensor size > 1e7 bytes and weight tensor height > 7
+   - Optimize @ref NESoftmaxLayer for axis != 0 by natively supporting higher axes up to axis 3.
+
+v24.02.1 Public patch release
+ - Fix performance regression in fixed-format kernels
+ - Fix compile and runtime errors in arm_compute_validation for Windows on Arm (WoA)
+
+v24.02 Public major release
+ - Replace template writer with compute kernel writer in dynamic fusion.
+ - Performance optimizations:
+   - Parallelize @ref NEDepthwiseConvolutionLayer over batches if there is only 1 row
+
+v24.01 Public major release
+ - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
+  You should link only to the main `libarm_compute` library for core functionality.
+ - Expand GPUTarget list with Mali™ G720 and G620.
+ - Optimize CPU activation functions using LUT-based implementation:
+   - Sigmoid function for FP16.
+ - New features
+   - Add support for FP16 in all multi_isa builds.
+ - Performance optimizations:
+   - Optimize @ref NESoftmaxLayer
+   - Optimize @ref NEDepthToSpaceLayer.
+
+v23.11 Public major release
+ - New features
+   - Add support for input data type U64/S64 in CLCast and NECast.
+   - Add support for output data type S64 in NEArgMinMaxLayer and CLArgMinMaxLayer
+   - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface:
+     - @ref experimental::dynamic_fusion::GpuCkwResize
+     - @ref experimental::dynamic_fusion::GpuCkwPool2d
+     - @ref experimental::dynamic_fusion::GpuCkwDepthwiseConv2d
+     - @ref experimental::dynamic_fusion::GpuCkwMatMul
+   - Add support for OpenCL™ command buffer with mutable dispatch extension.
+   - Add support for Arm® Cortex®-A520 and Arm® Cortex®-R82.
+   - Add support for negative axis values and inverted axis values in @ref arm_compute::NEReverse and @ref arm_compute::CLReverse.
+   - Add new OpenCL™ kernels:
+     - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support
+ - Performance optimizations:
+   - Optimize @ref cpu::CpuReshape
+   - Optimize @ref opencl::ClTranspose
+   - Optimize @ref NEStackLayer
+   - Optimize @ref CLReductionOperation.
+   - Optimize @ref CLSoftmaxLayer.
+   - Optimize start-up time of @ref NEConvolutionLayer for some input configurations where GeMM is selected as the convolution algorithm
+   - Reduce CPU Overhead by optimal flushing of CL kernels.
+ - Deprecate support for Bfloat16 in @ref cpu::CpuCast.
+ - Support for U32 axis in @ref arm_compute::NEReverse and @ref arm_compute::CLReverse will be deprecated in 24.02.
+ - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface.
+ - Update OpenCL™ API headers to v2023.04.17
+
+v23.08 Public major release
+ - Deprecate the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
+ Users must no longer link their applications to this library and instead link only to the main `libarm_compute` library for core functionality.
+ - New features
+   - Rewrite CLArgMinMaxLayer for axis 0 and enable S64 output.
+   - Add multi-sketch support for dynamic fusion.
+   - Break up arm_compute/core/Types.h and utils/Utils.h a bit to reduce unused code in each inclusion of these headers.
+   - Add Fused Activation to CLMatMul.
+   - Implement FP32/FP16 @ref opencl::kernels::ClMatMulNativeMMULKernel using the MMUL extension.
+   - Use MatMul in fully connected layer with dynamic weights when supported.
+   - Optimize CPU depthwise convolution with channel multiplier.
+   - Add support in CpuCastKernel for conversion of S64/U64 to F32.
+   - Add new OpenCL™ kernels:
+     - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
+   - Enable transposed convolution with non-square kernels on CPU and GPU.
+   - Add support for input data type U64/S64 in CLCast.
+   - Add new Compute Kernel Writer (CKW) subproject that offers a C++ interface to generate tile-based OpenCL code in just-in-time fashion.
+   - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface with support for FP16/FP32 only:
+     - @ref experimental::dynamic_fusion::GpuCkwActivation
+     - @ref experimental::dynamic_fusion::GpuCkwCast
+     - @ref experimental::dynamic_fusion::GpuCkwDirectConv2d
+     - @ref experimental::dynamic_fusion::GpuCkwElementwiseBinary
+     - @ref experimental::dynamic_fusion::GpuCkwStore
+ - Various optimizations and bug fixes.
+
+v23.05.1 Public patch release
+ - Enable CMake and Bazel option to build multi_isa without FP16 support.
+ - Fix compilation error in NEReorderLayer (aarch64 only).
+ - Disable invalid (false-negative) validation test with CPU scale layer on FP16.
+ - Various bug fixes
+
+v23.05 Public major release
+ - New features:
+   - Add new Arm® Neon™ kernels / functions:
+      - @ref NEMatMul for QASYMM8, QASYMM8_SIGNED, FP32 and FP16, with batch support.
+      - NEReorderLayer (aarch64 only)
+   - Add new OpenCL™ kernels / functions:
+      - @ref CLMatMul support for QASYMM8, QASYMM8_SIGNED, FP32 and FP16, with batch support.
+   - Add support for the multiple dimensions in the indices parameter for both the Arm® Neon™ and OpenCL™ implementations of the Gather Layer.
+   - Add support for dynamic weights in @ref CLFullyConnectedLayer and @ref NEFullyConnectedLayer for all data types.
+   - Add support for cropping in the Arm® Neon™ and OpenCL™ implementations of the BatchToSpace Layer for all data types.
+   - Add support for quantized data types for the ElementwiseUnary Operators for Arm® Neon™.
+   - Implement RSQRT for quantized data types on OpenCL™.
+   - Add FP16 depthwise convolution kernels for SME2.
+ - Performance optimizations:
+   - Improve CLTuner exhaustive mode tuning time.
+ - Deprecate dynamic block shape in @ref NEBatchToSpaceLayer and @ref CLBatchToSpaceLayer.
+ - Various optimizations and bug fixes.
+
+v23.02.1 Public patch release
+ - Allow mismatching data layouts between the source tensor and weights for \link cpu::CpuGemmDirectConv2d CpuGemmDirectConv2d \endlink with fixed format kernels.
+ - Fixes for experimental CPU only Bazel and CMake builds.
+
+v23.02 Public major release
+ - New features:
+   - Rework the experimental dynamic fusion interface by identifying auxiliary and intermediate tensors, and specifying an explicit output operator.
+   - Add the following operators to the experimental dynamic fusion API:
+     - GpuAdd, GpuCast, GpuClamp, GpuDepthwiseConv2d, GpuMul, GpuOutput, GpuPool2d, GpuReshape, GpuResize, GpuSoftmax, GpuSub.
+   - Add SME/SME2 kernels for GeMM, Winograd convolution, Depthwise convolution and Pooling.
+   - Add new CPU operator AddMulAdd for float and quantized types.
+   - Add new flag @ref ITensorInfo::lock_paddings() to tensors to prevent extending tensor paddings.
+   - Add experimental support for CPU only Bazel and CMake builds.
+ - Performance optimizations:
+   - Optimize CPU base-e exponential functions for FP32.
+   - Optimize CPU StridedSlice by copying first dimension elements in bulk where possible.
+   - Optimize CPU quantized Subtraction by reusing the quantized Addition kernel.
+   - Optimize CPU ReduceMean by removing quantization steps and performing the operation in integer domain.
+   - Optimize GPU Scale and Dynamic Fusion GpuResize by removing quantization steps and performing the operation in integer domain.
+   - Update the heuristic for CLDepthwiseConvolutionNative kernel.
+   - Add new optimized OpenCL kernel to compute indirect convolution:
+     - \link opencl::kernels::ClIndirectConv2dKernel ClIndirectConv2dKernel \endlink
+   - Add new optimized OpenCL kernel to compute transposed convolution:
+     - \link opencl::kernels::ClTransposedConvolutionKernel ClTransposedConvolutionKernel \endlink
+ - Update recommended/minimum NDK version to r20b.
+ - Various optimizations and bug fixes.
+
+v22.11 Public major release
+ - New features:
+   - Add new experimental dynamic fusion API.
+   - Add CPU batch matrix multiplication with adj_x = false and adj_y = false for FP32.
+   - Add CPU MeanStdDevNorm for QASYMM8.
+   - Add CPU and GPU GELU activation function for FP32 and FP16.
+   - Add CPU swish activation function for FP32 and FP16.
+ - Performance optimizations:
+   - Optimize CPU bilinear scale for FP32, FP16, QASYMM8, QASYMM8_SIGNED, U8 and S8.
+   - Optimize CPU activation functions using LUT-based implementation:
+     - Sigmoid function for QASYMM8 and QASYMM8_SIGNED.
+     - Hard swish function for QASYMM8_SIGNED.
+   - Optimize CPU addition for QASYMM8 and QASYMM8_SIGNED using fixed-point arithmetic.
+   - Optimize CPU multiplication, subtraction and activation layers by considering tensors as 1D.
+   - Optimize GPU depthwise convolution kernel and heuristic.
+   - Optimize GPU Conv2d heuristic.
+   - Optimize CPU MeanStdDevNorm for FP16.
+   - Optimize CPU tanh activation function for FP16 using rational approximation.
+ - Improve GPU GeMMLowp start-up time.
+ - Various optimizations and bug fixes.
+
+v22.08 Public major release
+ - Various bug fixes.
+ - Disable unsafe FP optimizations causing accuracy issues in:
+   - \link opencl::kernels::ClDirectConv2dKernel ClDirectConv2dKernel \endlink
+   - \link opencl::kernels::ClDirectConv3dKernel ClDirectConv3dKernel \endlink
+   - @ref CLDepthwiseConvolutionLayerNativeKernel
+ - Add Dynamic Fusion of Elementwise Operators: Div, Floor, Add.
+ - Optimize the gemm_reshaped_rhs_only_nt OpenCL kernel using the arm_matrix_multiply extension available for Arm® Mali™-G715 and Arm® Mali™-G615.
+ - Add support for the arm_matrix_multiply extension in the gemmlowp_mm_reshaped_only_rhs_t OpenCL kernel.
+ - Expand GPUTarget list with missing Mali™ GPUs product names: G57, G68, G78AE, G610, G510, G310.
+ - Extend the direct convolution 2d interface to configure the block size.
+ - Update ClConv2D heuristic to use direct convolution.
+ - Use official Khronos® OpenCL extensions:
+   - Add cl_khr_integer_dot_product extension support.
+   - Add support of OpenCL 3.0 non-uniform workgroup.
+ - Cpu performance optimizations:
+   - Add LUT-based implementation of Hard Swish and Leaky ReLU activation function for aarch64 build.
+   - Optimize Add layer by considering the input tensors as 1D array.
+ - Add fixed-format BF16, FP16 and FP32 Neon™ GEMM kernels to support variable weights.
+ - Add new winograd convolution kernels implementation and update the ACL \link arm_compute::cpu::CpuWinogradConv2d CpuWinogradConv2d\endlink operator.
+ - Add experimental support for native builds for Windows® on Arm™.
+ - Build flag interpretation change: arch=armv8.6-a now translates to -march=armv8.6-a CXX flag instead of march=armv8.2-a + explicit selection of feature extensions.
+ - Build flag change: toolchain_prefix, compiler_prefix:
+   - Use empty string "" to suppress any prefixes.
+   - Use "auto" to use default (auto) prefixes chosen by the build script. This is the default behavior when unspecified.
+   - Any other string will be used as custom prefixes to the compiler and the rest of toolchain tools.
+   - The default behaviour when prefix is unspecified does not change, but its signifier has been changed from empty string "" to "auto".
+ - armv7a with Android build will no longer be tested or maintained.
+
+v22.05 Public major release
+ - Various bug fixes.
+ - Various optimizations.
+ - Add support for NDK r23b.
+ - Inclusive language adjustment. Please refer to @ref S5_0_inc_lang for details.
+ - New Arm® Neon™ kernels / functions :
+   - \link cpu::kernels::CpuPool3dKernel CpuPool3dKernel \endlink
+ - New OpenCL kernels / functions :
+   - \link opencl::kernels::ClPool3dKernel ClPool3dKernel \endlink
+ - Improve the start-up times for the following OpenCL kernels:
+   - \link opencl::kernels::ClWinogradInputTransformKernel ClWinogradInputTransformKernel \endlink
+   - \link opencl::kernels::ClWinogradOutputTransformKernel ClWinogradOutputTransformKernel \endlink
+   - \link opencl::kernels::ClWinogradFilterTransformKernel ClWinogradFilterTransformKernel \endlink
+   - \link opencl::kernels::ClHeightConcatenateKernel ClHeightConcatenateKernel \endlink
+ - Decouple the implementation of the following Cpu kernels into various data types (fp32, fp16, int):
+   - \link cpu::kernels::CpuDirectConv2dKernel CpuDirectConv2dKernel \endlink
+   - \link cpu::kernels::CpuDepthwiseConv2dNativeKernel CpuDepthwiseConv2dNativeKernel \endlink
+   - \link cpu::kernels::CpuGemmMatrixAdditionKernel CpuGemmMatrixAdditionKernel \endlink
+   - \link cpu::kernels::CpuGemmMatrixMultiplyKernel CpuGemmMatrixMultiplyKernel \endlink
+   - @ref NEFuseBatchNormalizationKernel
+   - @ref NEL2NormalizeLayerKernel
+
+v22.02 Public major release
+ - Various bug fixes.
+ - Various optimizations.
+ - Update A510 arm_gemm cpu Kernels.
+ - Inclusive language adjustment. Please refer to @ref S5_0_inc_lang for details.
+ - Improve the start-up time for the following OpenCL kernels:
+   - @ref CLScale
+   - @ref CLGEMM
+   - @ref CLDepthwiseConvolutionLayer
+   - \link opencl::kernels::ClIm2ColKernel ClIm2ColKernel \endlink
+   - \link opencl::kernels::ClDirectConv2dKernel ClDirectConv2dKernel \endlink
+ - Remove functions:
+   - CLRemap
+   - NERemap
+ - Remove padding from OpenCL kernels:
+   - \link opencl::kernels::ClDirectConv2dKernel ClDirectConv2dKernel \endlink
+ - Remove padding from Cpu kernels:
+   - \link cpu::kernels::CpuDirectConv2dKernel CpuDirectConv2dKernel \endlink
+ - Decouple the implementation of the following Cpu kernels into various data types (fp32, fp16, int):
+   - \link cpu::kernels::CpuActivationKernel CpuActivationKernel \endlink
+   - \link cpu::kernels::CpuAddKernel CpuAddKernel \endlink
+   - \link cpu::kernels::CpuElementwiseKernel CpuElementwiseKernel \endlink
+   - \link cpu::CpuSoftmaxGeneric CpuSoftmaxKernel \endlink
+   - @ref NEBoundingBoxTransformKernel
+   - @ref NECropKernel
+   - @ref NEComputeAllAnchorsKernel
+   - @ref NEInstanceNormalizationLayerKernel
+   - NEMaxUnpoolingLayerKernel
+   - @ref NEMeanStdDevNormalizationKernel
+   - @ref NERangeKernel
+   - @ref NEROIAlignLayerKernel
+   - @ref NESelectKernel
+
+v21.11 Public major release
+ - Various bug fixes.
+ - Various optimizations:
+   - Improve performance of bilinear and nearest neighbor Scale on both CPU and GPU for FP32, FP16, Int8, Uint8 data types
+   - Improve performance of Softmax on GPU for Uint8/Int8
+ - New OpenCL kernels / functions:
+   - @ref CLConv3D
+ - New Arm® Neon™ kernels / functions:
+   - @ref NEConv3D
+ - Support configurable build by a selected subset of operator list
+ - Support MobileBert on Neon™ backend
+ - Improve operator/function logging
+ - Remove padding from OpenCL kernels:
+   - ClPool2dKernel
+   - ClScaleKernel
+   - ClGemmMatrixMultiplyReshapedKernel
+ - Remove padding from Cpu kernels:
+   - CpuPool2dKernel
+ - Remove Y padding from OpenCL kernels:
+   - ClGemmMatrixMultiplyKernel
+   - ClGemmReshapedRHSMatrixKernel
+ - Remove legacy GeMM kernels in gemm_v1.cl
+
+v21.08 Public major release
+ - Various bug fixes.
+ - Various optimizations:
+  - Improve LWS (Local-Workgroup-Size) heuristic in OpenCL for GeMM, Direct Convolution and Winograd Transformations when OpenCL tuner is not used
+  - Improve QASYMM8/QSYMM8 performance on OpenCL for various Arm® Mali™ GPU architectures
+  - Add dynamic weights support in Fully connected layer (CPU/GPU)
+  - Various performance optimizations for floating-point data types (CPU/GPU)
+ - Add a reduced core library build arm_compute_core_v2
+ - Expose Operator API
+ - Support fat binary build for armv8.2-a via fat_binary build flag
+ - Add CPU discovery capabilities
+ - Add data type f16 support for:
+  - CLRemapKernel
+ - Port the following functions to stateless API:
+   - @ref CLConvolutionLayer
+   - @ref CLFlattenLayer
+   - @ref CLFullyConnectedLayer
+   - @ref CLGEMM
+   - @ref CLGEMMConvolutionLayer
+   - @ref CLGEMMLowpMatrixMultiplyCore
+   - @ref CLWinogradConvolutionLayer
+   - @ref NEConvolutionLayer
+   - @ref NEFlattenLayer
+   - @ref NEFullyConnectedLayer
+   - @ref NEGEMM
+   - @ref NEGEMMConv2d
+   - @ref NEGEMMConvolutionLayer
+   - @ref NEGEMMLowpMatrixMultiplyCore
+   - @ref NEWinogradConvolutionLayer
+ - Remove the following functions:
+   - CLWinogradInputTransform
+ - Remove CLCoreRuntimeContext
+ - Remove ICPPSimpleKernel
+ - Rename file arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h to arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
+
 v21.05 Public major release
  - Various bug fixes.
  - Various optimisations.
@@ -62,7 +366,7 @@ v21.05 Public major release
   - @ref NEDeconvolutionLayer
  - Remove padding from OpenCL kernels:
    - @ref CLL2NormalizeLayerKernel
-   - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+   - CLDepthwiseConvolutionLayer3x3NHWCKernel
    - @ref CLNormalizationLayerKernel
    - @ref CLNormalizePlanarYUVLayerKernel
    - @ref opencl::kernels::ClMulKernel
@@ -153,7 +457,7 @@ v21.05 Public major release
    - CLThreshold
    - CLWarpAffine
    - CLWarpPerspective
- 
+
 v21.02 Public major release
  - Various bug fixes.
  - Various optimisations.
@@ -165,8 +469,8 @@ v21.02 Public major release
    - @ref NEActivationLayer
    - @ref NEArithmeticAddition
    - @ref NEBatchNormalizationLayerKernel
-   - @ref cpu::kernels::CpuLogits1DSoftmaxKernel
-   - @ref cpu::kernels::CpuLogits1DMaxKernel
+   - cpu::kernels::CpuLogits1DSoftmaxKernel
+   - cpu::kernels::CpuLogits1DMaxKernel
    - @ref cpu::kernels::CpuElementwiseUnaryKernel
  - Remove padding from OpenCL kernels:
    - CLDirectConvolutionLayerKernel
@@ -227,7 +531,7 @@ v20.11 Public major release
       - @ref CLLogSoftmaxLayer
       - GCSoftmaxLayer
  - New OpenCL kernels / functions:
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
    - @ref CLLogicalNot
    - @ref CLLogicalAnd
    - @ref CLLogicalOr
@@ -238,40 +542,40 @@ v20.11 Public major release
  - Removed padding from Arm® Neon™ kernels:
    - NEComplexPixelWiseMultiplicationKernel
    - NENonMaximaSuppression3x3Kernel
-   - @ref NERemapKernel
-   - @ref NEGEMMInterleave4x4Kernel
+   - NERemapKernel
+   - NEGEMMInterleave4x4Kernel
    - NEDirectConvolutionLayerKernel
    - NEScaleKernel
    - NELocallyConnectedMatrixMultiplyKernel
-   - @ref NEGEMMLowpOffsetContributionKernel
-   - @ref NEGEMMTranspose1xWKernel
+   - NEGEMMLowpOffsetContributionKernel
+   - NEGEMMTranspose1xWKernel
    - NEPoolingLayerKernel
    - NEConvolutionKernel
    - NEDepthwiseConvolutionLayerNativeKernel
-   - @ref NEGEMMLowpMatrixMultiplyKernel
-   - @ref NEGEMMMatrixMultiplyKernel
+   - NEGEMMLowpMatrixMultiplyKernel
+   - NEGEMMMatrixMultiplyKernel
    - NEDirectConvolutionLayerOutputStageKernel
    - @ref NEReductionOperationKernel
-   - @ref NEGEMMLowpMatrixAReductionKernel
-   - @ref NEGEMMLowpMatrixBReductionKernel
+   - NEGEMMLowpMatrixAReductionKernel
+   - NEGEMMLowpMatrixBReductionKernel
  - Removed padding from OpenCL kernels:
    - CLBatchConcatenateLayerKernel
    - CLElementwiseOperationKernel
    - @ref CLBatchNormalizationLayerKernel
    - CLPoolingLayerKernel
    - CLWinogradInputTransformKernel
-   - @ref CLGEMMLowpMatrixMultiplyNativeKernel
-   - @ref CLGEMMLowpMatrixAReductionKernel
-   - @ref CLGEMMLowpMatrixBReductionKernel
-   - @ref CLGEMMLowpOffsetContributionOutputStageKernel
-   - @ref CLGEMMLowpOffsetContributionKernel
+   - CLGEMMLowpMatrixMultiplyNativeKernel
+   - CLGEMMLowpMatrixAReductionKernel
+   - CLGEMMLowpMatrixBReductionKernel
+   - CLGEMMLowpOffsetContributionOutputStageKernel
+   - CLGEMMLowpOffsetContributionKernel
    - CLWinogradOutputTransformKernel
-   - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+   - CLGEMMLowpMatrixMultiplyReshapedKernel
    - @ref CLFuseBatchNormalizationKernel
    - @ref CLDepthwiseConvolutionLayerNativeKernel
    - CLDepthConvertLayerKernel
    - CLCopyKernel
-   - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+   - CLDepthwiseConvolutionLayer3x3NHWCKernel
    - CLActivationLayerKernel
    - CLWinogradFilterTransformKernel
    - CLWidthConcatenateLayerKernel
@@ -281,11 +585,11 @@ v20.11 Public major release
    - CLLogits1DNormKernel
    - CLHeightConcatenateLayerKernel
    - CLGEMMMatrixMultiplyKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-   - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+   - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
    - CLDepthConcatenateLayerKernel
-   - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+   - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
  - Removed OpenCL kernels / functions:
    - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
    - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
@@ -520,7 +824,7 @@ v20.08 Public major release
  - New OpenCL kernels / functions:
    - @ref CLMaxUnpoolingLayerKernel
  - New Arm® Neon™ kernels / functions:
-   - @ref NEMaxUnpoolingLayerKernel
+   - NEMaxUnpoolingLayerKernel
  - New graph example:
    - graph_yolov3_output_detector
  - GEMMTuner improvements:
@@ -567,7 +871,7 @@ v20.08 Public major release
       The default "axis" value for @ref NESoftmaxLayer, @ref NELogSoftmaxLayer is changed from 1 to 0.
       Only axis 0 is supported.
  - The support for quantized data types has been removed from @ref CLLogSoftmaxLayer due to implementation complexity.
- - Removed padding requirement for the input (e.g. LHS of GEMM) and output in CLGEMMMatrixMultiplyNativeKernel, CLGEMMMatrixMultiplyReshapedKernel, CLGEMMMatrixMultiplyReshapedOnlyRHSKernel and @ref CLIm2ColKernel (NHWC only)
+ - Removed padding requirement for the input (e.g. LHS of GEMM) and output in CLGEMMMatrixMultiplyNativeKernel, CLGEMMMatrixMultiplyReshapedKernel, CLGEMMMatrixMultiplyReshapedOnlyRHSKernel and CLIm2ColKernel (NHWC only)
    - This change allows to use @ref CLGEMMConvolutionLayer without extra padding for the input and output.
    - Only the weights/bias of @ref CLGEMMConvolutionLayer could require padding for the computation.
    - Only on Arm® Mali™ Midgard GPUs, @ref CLGEMMConvolutionLayer could require padding since CLGEMMMatrixMultiplyKernel is called and currently requires padding.
@@ -583,9 +887,9 @@ v20.05 Public major release
  - Updated recommended gcc version to Linaro 6.3.1.
  - Added Bfloat16 type support
  - Added Bfloat16 support in:
-     - @ref NEWeightsReshapeKernel
-     - @ref NEConvolutionLayerReshapeWeights
-     - @ref NEIm2ColKernel
+     - NEWeightsReshapeKernel
+     - NEConvolutionLayerReshapeWeights
+     - NEIm2ColKernel
      - NEIm2Col
      - NEDepthConvertLayerKernel
      - @ref NEDepthConvertLayer
@@ -596,9 +900,9 @@ v20.05 Public major release
      - @ref CLDeconvolutionLayer
      - @ref CLDirectDeconvolutionLayer
      - @ref CLGEMMDeconvolutionLayer
-     - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
-     - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-     - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+     - CLGEMMLowpMatrixMultiplyReshapedKernel
+     - CLGEMMLowpQuantizeDownInt32ScaleKernel
+     - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
      - @ref CLReductionOperation
      - @ref CLReduceMean
      - @ref NEScale
@@ -609,7 +913,7 @@ v20.05 Public major release
      - @ref NEReduceMean
      - @ref NEArgMinMaxLayer
      - @ref NEDeconvolutionLayer
-     - @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
+     - NEGEMMLowpQuantizeDownInt32ScaleKernel
      - @ref CPPBoxWithNonMaximaSuppressionLimit
      - @ref CPPDetectionPostProcessLayer
      - @ref CPPPermuteKernel
@@ -639,9 +943,9 @@ v20.05 Public major release
  - Removed NEDepthwiseConvolutionLayerOptimized
  - Added support for Winograd 3x3,4x4 on Arm® Neon™ FP16:
      - @ref NEWinogradConvolutionLayer
-     - @ref NEWinogradLayerTransformInputKernel
-     - @ref NEWinogradLayerTransformOutputKernel
-     - @ref NEWinogradLayerTransformWeightsKernel
+     - CpuWinogradConv2dTransformInputKernel
+     - CpuWinogradConv2dTransformOutputKernel
+     - CpuWinogradConv2dTransformWeightsKernel
  - Added CLCompileContext
  - Added Arm® Neon™ GEMM kernel with 2D window support
 
@@ -655,9 +959,9 @@ v20.02 Public major release
      - @ref CLDepthwiseConvolutionLayer
      - CLDepthwiseConvolutionLayer3x3
      - @ref CLGEMMConvolutionLayer
-     - @ref CLGEMMLowpMatrixMultiplyCore
-     - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-     - @ref CLGEMMLowpMatrixMultiplyNativeKernel
+     - @ref CLGEMMLowpMatrixMultiplyCore
+     - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+     - CLGEMMLowpMatrixMultiplyNativeKernel
      - @ref NEActivationLayer
      - NEComparisonOperationKernel
      - @ref NEConvolutionLayer
@@ -680,10 +984,10 @@ v20.02 Public major release
      - @ref NESplit
  - New OpenCL kernels / functions:
      - @ref CLFill
-     - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
+     - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
  - New Arm® Neon™ kernels / functions:
      - @ref NEFill
-     - @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
+     - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
  - Deprecated Arm® Neon™ functions / interfaces:
      - CLDepthwiseConvolutionLayer3x3
      - NEDepthwiseConvolutionLayerOptimized
@@ -800,7 +1104,7 @@ v19.08 Public major release
     - NEBatchConcatenateLayerKernel
     - @ref NEDepthToSpaceLayerKernel / @ref NEDepthToSpaceLayer
     - NEDepthwiseConvolutionLayerNativeKernel
-    - @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+    - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
     - @ref NEMeanStdDevNormalizationKernel / @ref NEMeanStdDevNormalizationLayer
     - @ref NESpaceToDepthLayerKernel / @ref NESpaceToDepthLayer
  - New OpenCL kernels / functions:
@@ -848,7 +1152,7 @@ v19.05 Public major release
     - @ref NEFFTDigitReverseKernel
     - @ref NEFFTRadixStageKernel
     - @ref NEFFTScaleKernel
-    - @ref NEGEMMLowpOffsetContributionOutputStageKernel
+    - NEGEMMLowpOffsetContributionOutputStageKernel
     - NEHeightConcatenateLayerKernel
     - @ref NESpaceToBatchLayerKernel / @ref NESpaceToBatchLayer
     - @ref NEFFT1D
@@ -861,7 +1165,7 @@ v19.05 Public major release
     - @ref CLFFTDigitReverseKernel
     - @ref CLFFTRadixStageKernel
     - @ref CLFFTScaleKernel
-    - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+    - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
     - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
     - CLHeightConcatenateLayerKernel
     - @ref CLDirectDeconvolutionLayer
@@ -953,7 +1257,7 @@ v19.02 Public major release
     - @ref CLRangeKernel / @ref CLRange
     - @ref CLUnstack
     - @ref CLGatherKernel / @ref CLGather
-    - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+    - CLGEMMLowpMatrixMultiplyReshapedKernel
  - New CPP kernels / functions:
     - @ref CPPDetectionOutputLayer
     - @ref CPPTopKV / @ref CPPTopKVKernel
@@ -1020,7 +1324,7 @@ v18.11 Public major release
  - Added the validate method in:
     - @ref NEDepthConvertLayer
     - @ref NEFloor / @ref CLFloor
-    - @ref NEGEMMMatrixAdditionKernel
+    - NEGEMMMatrixAdditionKernel
     - @ref NEReshapeLayer / @ref CLReshapeLayer
     - @ref CLScale
  - Added new examples:
@@ -1032,10 +1336,10 @@ v18.11 Public major release
     - CLWidthConcatenateLayer
     - CLFlattenLayer
     - @ref CLSoftmaxLayer
- - Add dot product support for @ref CLDepthwiseConvolutionLayer3x3NHWCKernel non-unit stride
+ - Add dot product support for CLDepthwiseConvolutionLayer3x3NHWCKernel non-unit stride
  - Add SVE support
  - Fused batch normalization into convolution layer weights in @ref CLFuseBatchNormalization
- - Fuses activation in @ref CLDepthwiseConvolutionLayer3x3NCHWKernel, @ref CLDepthwiseConvolutionLayer3x3NHWCKernel and @ref NEGEMMConvolutionLayer
+ - Fuses activation in CLDepthwiseConvolutionLayer3x3NCHWKernel, CLDepthwiseConvolutionLayer3x3NHWCKernel and @ref NEGEMMConvolutionLayer
  - Added NHWC data layout support to:
     - @ref CLChannelShuffleLayer
     - @ref CLDeconvolutionLayer
@@ -1045,7 +1349,7 @@ v18.11 Public major release
     - NEDepthwiseConvolutionLayer3x3Kernel
     - CLPixelWiseMultiplicationKernel
  - Added FP16 support to the following kernels:
-    - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+    - CLDepthwiseConvolutionLayer3x3NHWCKernel
     - NEDepthwiseConvolutionLayer3x3Kernel
     - @ref CLNormalizePlanarYUVLayerKernel
     - @ref CLWinogradConvolutionLayer (5x5 kernel)
@@ -1064,7 +1368,7 @@ v18.08 Public major release
     - @ref CLDirectConvolutionLayer
     - @ref CLConvolutionLayer
     - @ref CLScale
-    - @ref CLIm2ColKernel
+    - CLIm2ColKernel
  - New Arm® Neon™ kernels / functions:
     - @ref NERNNLayer
  - New OpenCL kernels / functions:
@@ -1171,9 +1475,9 @@ v18.02 Public major release
     - Added name() method to all kernels.
     - Added support for Winograd 5x5.
     - NEPermuteKernel / @ref NEPermute
-    - @ref NEWinogradLayerTransformInputKernel / NEWinogradLayer
-    - @ref NEWinogradLayerTransformOutputKernel / NEWinogradLayer
-    - @ref NEWinogradLayerTransformWeightsKernel / NEWinogradLayer
+    - CpuWinogradConv2dTransformInputKernel / NEWinogradLayer
+    - CpuWinogradConv2dTransformOutputKernel / NEWinogradLayer
+    - CpuWinogradConv2dTransformWeightsKernel / NEWinogradLayer
     - Renamed NEWinogradLayerKernel into NEWinogradLayerBatchedGEMMKernel
  - New GLES kernels / functions:
     - GCTensorShiftKernel / GCTensorShift
@@ -1242,13 +1546,13 @@ v17.12 Public major release
     - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
     - arm_compute::NEHGEMMAArch64FP16Kernel
     - NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
-    - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
-    - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+    - NEGEMMLowpOffsetContributionKernel / NEGEMMLowpMatrixAReductionKernel / NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
+    - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
     - NEWinogradLayer / NEWinogradLayerKernel
 
  - New OpenCL kernels / functions
-    - @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore
-    - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+    - CLGEMMLowpOffsetContributionKernel / CLGEMMLowpMatrixAReductionKernel / CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore
+    - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
 
  - New graph nodes for Arm® Neon™ and OpenCL
     - graph::BranchLayer
@@ -1280,13 +1584,13 @@ v17.09 Public major release
     - NEDequantizationLayerKernel / @ref NEDequantizationLayer
     - NEFloorKernel / @ref NEFloor
     - @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer
-    - NEQuantizationLayerKernel @ref NEMinMaxLayerKernel / @ref NEQuantizationLayer
+    - NEQuantizationLayerKernel, NEMinMaxLayerKernel / @ref NEQuantizationLayer
     - @ref NEROIPoolingLayerKernel / @ref NEROIPoolingLayer
     - @ref NEReductionOperationKernel / @ref NEReductionOperation
     - NEReshapeLayerKernel / @ref NEReshapeLayer
 
  - New OpenCL kernels / functions:
-    - @ref CLDepthwiseConvolutionLayer3x3NCHWKernel @ref CLDepthwiseConvolutionLayer3x3NHWCKernel CLDepthwiseIm2ColKernel CLDepthwiseVectorToTensorKernel CLDepthwiseWeightsReshapeKernel / CLDepthwiseConvolutionLayer3x3 @ref CLDepthwiseConvolutionLayer CLDepthwiseSeparableConvolutionLayer
+    - CLDepthwiseConvolutionLayer3x3NCHWKernel CLDepthwiseConvolutionLayer3x3NHWCKernel CLDepthwiseIm2ColKernel CLDepthwiseVectorToTensorKernel CLDepthwiseWeightsReshapeKernel / CLDepthwiseConvolutionLayer3x3 @ref CLDepthwiseConvolutionLayer CLDepthwiseSeparableConvolutionLayer
     - CLDequantizationLayerKernel / CLDequantizationLayer
     - CLDirectConvolutionLayerKernel / @ref CLDirectConvolutionLayer
     - CLFlattenLayer
@@ -1294,7 +1598,7 @@ v17.09 Public major release
     - CLGEMMTranspose1xW
     - CLGEMMMatrixVectorMultiplyKernel
     - @ref CLL2NormalizeLayerKernel / @ref CLL2NormalizeLayer
-    - CLQuantizationLayerKernel @ref CLMinMaxLayerKernel / @ref CLQuantizationLayer
+    - CLQuantizationLayerKernel, CLMinMaxLayerKernel / @ref CLQuantizationLayer
     - @ref CLROIPoolingLayerKernel / @ref CLROIPoolingLayer
     - @ref CLReductionOperationKernel / @ref CLReductionOperation
     - CLReshapeLayerKernel / @ref CLReshapeLayer
@@ -1307,13 +1611,13 @@ v17.06 Public major release
  - Added infrastructure to provide GPU specific optimisation for some OpenCL kernels.
  - Added @ref OMPScheduler (OpenMP) scheduler for Neon
  - Added @ref SingleThreadScheduler scheduler for Arm® Neon™ (For bare metal)
- - User can specify his own scheduler by implementing the @ref IScheduler interface.
+ - Users can specify their own scheduler by implementing the @ref IScheduler interface.
  - New OpenCL kernels / functions:
     - @ref CLBatchNormalizationLayerKernel / @ref CLBatchNormalizationLayer
     - CLDepthConcatenateLayerKernel / CLDepthConcatenateLayer
     - CLHOGOrientationBinningKernel CLHOGBlockNormalizationKernel, CLHOGDetectorKernel / CLHOGDescriptor CLHOGDetector CLHOGGradient CLHOGMultiDetection
     - CLLocallyConnectedMatrixMultiplyKernel / CLLocallyConnectedLayer
-    - @ref CLWeightsReshapeKernel / @ref CLConvolutionLayerReshapeWeights
+    - CLWeightsReshapeKernel / CLConvolutionLayerReshapeWeights
  - New C++ kernels:
     - CPPDetectionWindowNonMaximaSuppressionKernel
  - New Arm® Neon™ kernels / functions:
@@ -1321,7 +1625,7 @@ v17.06 Public major release
     - NEDepthConcatenateLayerKernel / NEDepthConcatenateLayer
     - NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
     - NELocallyConnectedMatrixMultiplyKernel / NELocallyConnectedLayer
-    - @ref NEWeightsReshapeKernel / @ref NEConvolutionLayerReshapeWeights
+    - NEWeightsReshapeKernel / NEConvolutionLayerReshapeWeights
 
 v17.05 Public bug fixes release
  - Various bug fixes
@@ -1362,9 +1666,9 @@ v17.03.1 First Major public release of the sources
    - @ref NENormalizationLayerKernel / @ref NENormalizationLayer
    - NETransposeKernel / @ref NETranspose
    - NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
-   - @ref NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
+   - NEIm2ColKernel, NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
    - NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
-   - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
+   - NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
 
 v17.03 Sources preview
  - New OpenCL kernels / functions:
@@ -1377,15 +1681,15 @@ v17.03 Sources preview
    - CLLaplacianPyramid, CLLaplacianReconstruct
  - New Arm® Neon™ kernels / functions:
    - NEActivationLayerKernel / @ref NEActivationLayer
-   - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
+   - GEMM refactoring + FP16 support (Requires armv8.2 CPU): NEGEMMInterleave4x4Kernel, NEGEMMTranspose1xWKernel, NEGEMMMatrixMultiplyKernel, NEGEMMMatrixAdditionKernel / @ref NEGEMM
    - NEPoolingLayerKernel / @ref NEPoolingLayer
 
 v17.02.1 Sources preview
  - New OpenCL kernels / functions:
    - CLLogits1DMaxKernel, CLLogits1DShiftExpSumKernel, CLLogits1DNormKernel / @ref CLSoftmaxLayer
    - CLPoolingLayerKernel / @ref CLPoolingLayer
-   - @ref CLIm2ColKernel, @ref CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / CLConvolutionLayer
-   - @ref CLRemapKernel / @ref CLRemap
+   - CLIm2ColKernel, CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / CLConvolutionLayer
+   - CLRemapKernel / CLRemap
    - CLGaussianPyramidHorKernel, CLGaussianPyramidVertKernel / CLGaussianPyramid, CLGaussianPyramidHalf, CLGaussianPyramidOrb
    - CLMinMaxKernel, CLMinMaxLocationKernel / CLMinMaxLocation
    - CLNonLinearFilterKernel / CLNonLinearFilter
@@ -1412,4 +1716,4 @@ v16.12 Binary preview release
  - Original release
 
  */
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute